diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a40f42..f7643ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.18) project(cuASR CUDA CXX) # RELEASE config by default if none is provided: @@ -22,13 +22,15 @@ option(CUASR_TEST "Build cuASR test suite. Use with CUASR_TEST_LEVEL={0|1|2}. option(CUASR_BENCH "Build cuASR benchmark suite." ON) option(CUASR_EXAMPLE "Build cuASR examples." ON) -# By default, build fat binaries. TODO add sm_80 here -option(CUASR_CUDA_ARCHS "List of CUDA architectures to compile for." "60 61 70 72 75") - # CUDA native compiler (nvcc) only supports upto C++14 for now find_package(CUDA REQUIRED) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) # C++ compiler flags for target compile options set(cuASR_CXX_FLAGS -Wall -Wextra -Wno-unused-parameter -Wno-uninitialized -Wno-strict-aliasing) @@ -41,7 +43,11 @@ set(cuASR_CUDA_FLAGS --expt-relaxed-constexpr) set(cuASR_CUDA_FLAGS_DEBUG -G ${cuASR_CUDA_FLAGS}) set(cuASR_CUDA_FLAGS_RELEASE -O3 ${cuASR_CUDA_FLAGS}) set(cuASR_CUDA_FLAGS_RELWITHDEBINFO -G ${cuASR_CUDA_FLAGS}) -set(CMAKE_CUDA_ARCHITECTURES ${CUASR_CUDA_ARCHS}) +if(NOT DEFINED CUASR_CUDA_ARCHS) + set(CMAKE_CUDA_ARCHITECTURES 80) +else() + set(CMAKE_CUDA_ARCHITECTURES ${CUASR_CUDA_ARCHS}) +endif() # the sub-modules update themselves with git, so find git find_package(Git QUIET) @@ -94,6 +100,7 @@ message(STATUS " C++ Compiler : ${CMAKE_CXX_COMPILER}") message(STATUS " C++ Compiler version : ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS " CUDA Compiler : ${CMAKE_CUDA_COMPILER}") message(STATUS " CUDA Compiler version: ${CMAKE_CUDA_COMPILER_VERSION}") +message(STATUS " CUDA Arch support : ${CMAKE_CUDA_ARCHITECTURES}") message(STATUS " Build tests : ${CUASR_TEST}") message(STATUS " Test level : 
${CUASR_TEST_LEVEL}") message(STATUS " Build benchmarks : ${CUASR_BENCH}") diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..eb8817d --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,28 @@ +Copyright (c) 2020 - 2022 Vijay Thakkar. +Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: BSD-3-Clause + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.md b/README.md index 96235ca..0ceb363 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ Normally, BLAS libraries are defined as operations over real numbers,`+` and `x` cuASR is a template library and therefore header only, but includes an exhaustive list of tests and benchmarks. The build system is based on `CMake`. Basic checkout and build instructions are as follows: ```sh -$ git clone --recurse-submodules https://github.com/hpcgarage/semiring-gemm /path/to/repo +$ git clone --recurse-submodules https://github.com/hpcgarage/cuASR /path/to/repo $ cd /path/to/repo $ mkdir build && cd build -$ cmake .. -G Ninja -DCUASR_CUDA_ARCHS="70 75" +$ cmake .. -G Ninja -DCMAKE_CUDA_ARCHITECTURES="70 75 80" $ ninja ``` @@ -34,7 +34,7 @@ Notable build flags: | Build Flag | Usage Description | |-|-| -| `CUASR_CUDA_ARCHS` | lists the CUDA SM architectures the fat binaries should be built to target. `CUASR_CUDA_ARCHS="60 61 70 72 75"` (all Pascal and Volta GPUs) will be used if no value is specified, but this can really hurt compile times for tests and benchmarks; Limit CUDA architectures to the smallest subset you forsee running the tests and benchmarks on. +| `CMAKE_CUDA_ARCHITECTURES` | lists the CUDA SM architectures the fat binaries should be built to target. `CMAKE_CUDA_ARCHITECTURES="80"` (Ampere) will be used if no value is specified. Targeting many architectures can really hurt compile times for tests and benchmarks; limit CUDA architectures to the smallest subset you foresee running the tests and benchmarks on. | `CUASR_TEST` | Set to `ON` by default and controls whether tests will be built or not. Set to `OFF` to disable building all tests. | | `CUASR_BENCH` | Set to `ON` by default and controls whether benchmarks will be built or not. Set to `OFF` to disable building all benchmarks. | | `CUASR_EXAMPLES` | Set to `ON` by default and controls whether examples will be built or not. Set to `OFF` to disable building all examples. 
| @@ -92,21 +92,19 @@ auto cuasr_minplus_srsgemm_nt( bool do_epilogue_min, cudaStream_t stream = nullptr) -> int { // compile time configuration of this srgemm kernel - using OperatorClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - using AdditionOp = cuasr::minimum; - using MultiplicationOp = cuasr::plus; + using OperatorClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using TropicalConfig = typename cuasr::gemm::device::DefaultSemiRingConfiguration< float, float, float, float, OperatorClass, // - AdditionOp, MultiplicationOp, SmArch>; + RingOp, SmArch>; using ColumnMajor = cutlass::layout::ColumnMajor; using RowMajor = cutlass::layout::RowMajor; using cuASR_MinPlus_SGEMM = cuasr::gemm::device::Srgemm< - AdditionOp, // Thread level semiring add operator - MultiplicationOp, // Thread level semiRing multiply operator + RingOp, // Thread level SemiRing operator float, // element type of A ColumnMajor, // layout of A float, // element type of B @@ -114,11 +112,10 @@ auto cuasr_minplus_srsgemm_nt( float, // element type of C RowMajor, // layout of C float // element type of D - >; + >; - float alpha = MultiplicationOp::Identity; - float beta - = do_epilogue_min ? MultiplicationOp::Identity : MultiplicationOp::Annihilator; + float alpha = RingOp::MultIdentity; + float beta = do_epilogue_min ? RingOp::MultIdentity : RingOp::MultAnnihilator; // construct kernel arguments struct cuASR_MinPlus_SGEMM::Arguments args( @@ -161,28 +158,26 @@ After the operator struct is defined, the rest is some simple boilerplate for in The code excerpt below is taken from [`examples/01_userdefined_semiring`](examples/01_userdefined_semiring/userdefined_semiring.cu). 
```cpp -template -struct binary_xor { - static T constexpr Identity = static_cast(false); - // scalar operator +template +struct xor_and { + static T constexpr AddIdentity = static_cast(false); + static T constexpr MultIdentity = static_cast(true); + static T constexpr MultAnnihilator = static_cast(false); + __host__ __device__ - T operator()(T lhs, T const &rhs) const { - lhs ^= rhs; - return lhs; + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); } __host__ __device__ - cutlass::Array - operator()(cutlass::Array const &lhs, cutlass::Array const &rhs) const { - cutlass::Array result; - #pragma unroll - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], rhs[i]); - } - return result; + T add(T const lhs, T const rhs) const { + return lhs ^ rhs; } - // ... other overloads for cutlass::Array here ... + __host__ __device__ + T mult(T const lhs, T const rhs) const { + return lhs && rhs; + } }; // GF(2) xor-and SRGEMM @@ -203,10 +198,9 @@ auto cuasr_gf_srgemm_nnn( using OperatorClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; - using AdditionOp = binary_xor; - using MultiplicationOp = cuasr::binary_and; + using RingOp = xor_and; using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, int, 1>; + RingOp, int, 1>; static int constexpr AlignmentA = 1; static int constexpr AlignmentB = 1; @@ -220,8 +214,7 @@ auto cuasr_gf_srgemm_nnn( using RowMajor = cutlass::layout::RowMajor; using cuASRGaloisFieldSrgemm = cuasr::gemm::device::Srgemm< - AdditionOp, // Thread level SemiRing operator - MultiplicationOp, // Thread level SemiRing operator + RingOp, // Thread level SemiRing operator int, // element type of A RowMajor, // layout of A int, // element type of B @@ -242,8 +235,8 @@ auto cuasr_gf_srgemm_nnn( false // SplitKSerial >; - int alpha = MultiplicationOp::Identity; - int beta = do_epilogue_and ? 
MultiplicationOp::Identity : MultiplicationOp::Annihilator; + int alpha = RingOp::MultIdentity; + int beta = do_epilogue_and ? RingOp::MultIdentity : RingOp::MultAnnihilator; // construct kernel arguments struct cuASRGaloisFieldSrgemm::Arguments args( @@ -291,24 +284,30 @@ When a device level SRGEMM template, `cuasr::gemm::device::Srgemm`, is instantia namespace cuasr::arch { template < // ... datatype and GEMM shape template params - typename AdditionOp, - typename MultiplicationOp + typename RingOp > -struct Srmma { +struct Srmma< + cutlass::gemm::GemmShape<1, 1, 1>, + 1, + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + RingOp> { using Shape = cutlass::gemm::GemmShape<1, 1, 1>; - // operators must be default contructible and contain a binary operator() - AdditionOp add; - MultiplicationOp mult; + RingOp ring_op; - __host__ __device__ + CUTLASS_HOST_DEVICE void operator()( cutlass::Array &d, cutlass::Array const &a, cutlass::Array const &b, cutlass::Array const &c ) { - d[0] = add(c[0], mult(a[0], b[0])); + ring_op.fma(d[0], a[0], b[0], c[0]); } }; } diff --git a/bench/device/CMakeLists.txt b/bench/device/CMakeLists.txt index c1d1183..a10eeab 100644 --- a/bench/device/CMakeLists.txt +++ b/bench/device/CMakeLists.txt @@ -1,16 +1,53 @@ -file(GLOB SIMT_BENCH_SRCS CONFIGURE_DEPENDS *.cu) -add_executable(cuasr_bench_srgemm_device +# SM50 default configurations +add_executable(cuasr_bench_srgemm_device_sm50_defaults + sm50_defaults.cu +) +target_include_directories( + cuasr_bench_srgemm_device_sm50_defaults + PRIVATE + ${PROJECT_SOURCE_DIR}/include/ + ${PROJECT_SOURCE_DIR}/tools/include/ + ${PROJECT_SOURCE_DIR}/cutlass/include/ + ${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/ +) +target_link_libraries(cuasr_bench_srgemm_device_sm50_defaults + benchmark + benchmark_main + ${cuASR_LIB_NAME} +) + +# SM80 default configurations +add_executable(cuasr_bench_srgemm_device_sm80_defaults + sm80_defaults.cu +) +target_include_directories( + 
cuasr_bench_srgemm_device_sm80_defaults + PRIVATE + ${PROJECT_SOURCE_DIR}/include/ + ${PROJECT_SOURCE_DIR}/tools/include/ + ${PROJECT_SOURCE_DIR}/cutlass/include/ + ${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/ +) +target_link_libraries(cuasr_bench_srgemm_device_sm80_defaults + benchmark + benchmark_main + ${cuASR_LIB_NAME} +) + +# All shmoo benchmarks +file(GLOB SIMT_BENCH_SRCS CONFIGURE_DEPENDS sm50_simt_*.cu) +add_executable(cuasr_bench_srgemm_device_shmoo ${SIMT_BENCH_SRCS} ) target_include_directories( - cuasr_bench_srgemm_device + cuasr_bench_srgemm_device_shmoo PRIVATE ${PROJECT_SOURCE_DIR}/include/ ${PROJECT_SOURCE_DIR}/tools/include/ ${PROJECT_SOURCE_DIR}/cutlass/include/ ${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/ ) -target_link_libraries(cuasr_bench_srgemm_device +target_link_libraries(cuasr_bench_srgemm_device_shmoo benchmark benchmark_main ${cuASR_LIB_NAME} @@ -18,6 +55,6 @@ target_link_libraries(cuasr_bench_srgemm_device if(NOT DEFINED CUASR_BENCH_LEVEL) set(CUASR_BENCH_LEVEL 0) endif() -target_compile_definitions(cuasr_bench_srgemm_device +target_compile_definitions(cuasr_bench_srgemm_device_shmoo PRIVATE CUASR_BENCH_LEVEL=${CUASR_BENCH_LEVEL} ) diff --git a/bench/device/gen_default_bench.py b/bench/device/gen_default_bench.py new file mode 100644 index 0000000..34ed62e --- /dev/null +++ b/bench/device/gen_default_bench.py @@ -0,0 +1,173 @@ +import os +import sys +import argparse + +################################################################################ +# This file creates all the possible semiring-gemm kernels for all transposes +# using just the default SRGEMM configurations for them. 
+################################################################################ + +precisions = [ + ["f64", "double"], + ["f32", "float"], + ["s32", "int"] +] + +tnspposes = [ + [False, False, True], + [False, False, False], + [False, True, True], + [False, True, False], + [True, False, True], + [True, False, False], + [True, True, True], + [True, True, False], +] + +semiring_operators = [ + ["plus", "mult"], # regular GEMM + ["min", "plus"], # min-plus (tropical) + ["max", "plus"], # max-plus + ["min", "max"], # min-max + ["max", "min"], # max-min + ["min", "mult"], # min-multiplies + ["max", "mult"], # max-multiplies + ["or", "and"] # or-and +] + +benchfile_header = """\ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" +""" + +bench_template = """\ + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM{sm_arch}_default_{add_op}_{mult_op}_{precision_char}_srgemm_{tnspA}{tnspB}_{tnspC}(benchmark::State &state) {{ + const auto N = static_cast(state.range(0)); + using precision = {precision_type}; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm{sm_arch}; + using RingOp = cuasr::{add_op}_{mult_op}; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::{tnsp_typeA}Major, // + precision, cutlass::layout::{tnsp_typeB}Major, // + precision, 
cutlass::layout::{tnsp_typeC}Major, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({{ N, N, N }}); + + // benchmark loop + for (auto _ : state) {{ + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + }} + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +}} +BENCHMARK(BM_SM{sm_arch}_default_{add_op}_{mult_op}_{precision_char}_srgemm_{tnspA}{tnspB}_{tnspC}) + ->RangeMultiplier(2)->Range(256, 4096); +""" + + +def write_benchmark_file_header(benchfile): + benchfile.write(benchfile_header) + + +def write_benchmark_to_file( + benchfile, + sm_arch, + add_op, + mult_op, + precision_char, + precision_type, + tnspA, + tnspB, + tnspC): + tnsp_typeA = "Column" if tnspA == "n" else "Row" + tnsp_typeB = "Column" if tnspB == "n" else "Row" + tnsp_typeC = "Column" if tnspC == "n" else "Row" + benchfile.write(bench_template.format( + sm_arch=sm_arch, + add_op=add_op, + mult_op=mult_op, + precision_char=precision_char, + precision_type=precision_type, + tnspA=tnspA, + tnspB=tnspB, + tnspC=tnspC, + tnsp_typeA=tnsp_typeA, + tnsp_typeB=tnsp_typeB, + tnsp_typeC=tnsp_typeC + )) + + +def main(args): + num_benches = 0 + benchfile_name = "sm{}_defaults.cu".format(args.sm_arch) + print(benchfile_name) + filePath = os.path.join(args.output_dir, benchfile_name) + + # open file and gen all default tests + with open(filePath, "w") as benchfile: + write_benchmark_file_header(benchfile) + + # for all semirings + for add_op, mult_op in semiring_operators: + # for all precisions + for precision in precisions: + precision_char = precision[0] + precision_type = precision[1] + + # tnspposes + for tnsppose in tnspposes: + # get tnsppose char + column_major_A = tnsppose[0] + column_major_B = tnsppose[1] + column_major_C = tnsppose[2] + tnspA = "n" if column_major_A else "t" + tnspB = "n" if column_major_B else "t" + tnspC = "n" 
if column_major_C else "t" + + # write to file + write_benchmark_to_file( + benchfile, + args.sm_arch, + add_op, + mult_op, + precision_char, + precision_type, + tnspA, + tnspB, + tnspC) + num_benches += 1 + print("Total bench count per semi-ring = {}".format( + num_benches // len(semiring_operators))) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output-dir", type=str, required=False, default=".", + help="Path to the output dir.") + parser.add_argument("-sm", "--sm-arch", type=int, required=False, default=50, choices=[50, 80], + help="SM architecture version number,") + args = parser.parse_args(sys.argv[1:]) + main(args) diff --git a/bench/device/simt_sm50.py b/bench/device/gen_simt.py similarity index 80% rename from bench/device/simt_sm50.py rename to bench/device/gen_simt.py index bfe6065..1a4c199 100644 --- a/bench/device/simt_sm50.py +++ b/bench/device/gen_simt.py @@ -1,4 +1,6 @@ import os +import sys +import argparse # this file creates the bench/unit/gemm/device simt benchmarks and the CMake file to go with it ################################################################################ @@ -25,10 +27,9 @@ # char, type bits/elem, max tile, L0 threadblock tiles precisions = [ - ["d", "double", 64, 64*64, [[64, 64], [32, 32]]], - ["s", "float", 32, 128 * - 128, [[128, 256], [128, 128], [64, 64]]], - # ["h", "cutlass::half_t", 16, 128*256, [ [256, 128], [ 64, 128], [ 64, 32] ] ], + ["f64", "double", 64, 128 * 64, [[128, 64], [ 64, 64], [ 32, 32]]], + ["f32", "float", 32, 256 * 128, [[256, 128], [128, 128], [128, 64], [64, 64]]], + # ["h", "cutlass::half_t", 16, 128*256, [[256, 128], [ 64, 128], [ 64, 32] ] ], # ["i", "int", 32, 128*128, [[128, 64], [16, 32]]], ] @@ -44,19 +45,19 @@ ] semiring_operators = [ - ["plus", "multiplies"], # regular GEMM - ["minimum", "plus"], # min-plus (tropical) - ["maximum", "plus"], # max-plus - ["minimum", "maximum"], # min-max - ["maximum", "minimum"], # max-min - 
["minimum", "multiplies"], # min-multiplies - ["maximum", "multiplies"], # max-multiplies - ["binary_or", "binary_and"] # or-and + ["plus", "mult"], # regular GEMM + ["min", "plus"], # min-plus (tropical) + ["max", "plus"], # max-plus + ["min", "max"], # min-max + ["max", "min"], # max-min + ["min", "mult"], # min-multiplies + ["max", "mult"], # max-multiplies + ["or", "and"] # or-and ] benchfile_header = """\ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -83,26 +84,25 @@ bench_template = """\ #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= {21}) -static void BM_SM50_device_{0}_{1}_{2}srgemm_{4}{5}_{6}_{10}x{11}x{12}_{13}x{14}x1_{15}x{16}_{17}x{18}_{19}x{20}(benchmark::State &state) {{ +static void BM_SM{22}_device_{0}_{1}_{2}_srgemm_{4}{5}_{6}_{10}x{11}x{12}_{13}x{14}x1_{15}x{16}_{17}x{18}_{19}x{20}(benchmark::State &state) {{ const auto N = static_cast(state.range(0)); using precision = {3}; using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; + using SmArch = cutlass::arch::Sm{22}; + using RingOp = cuasr::{0}_{1}; using ThreadblockShape = cutlass::gemm::GemmShape<{10}, {11}, {12}>; using WarpShape = cutlass::gemm::GemmShape<{13}, {14}, {12}>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::{0}, cuasr::{1}, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::{7}Major, // precision, cutlass::layout::{8}Major, // precision, cutlass::layout::{9}Major, // @@ -123,7 +123,7 @@ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); }} -BENCHMARK(BM_SM50_device_{0}_{1}_{2}srgemm_{4}{5}_{6}_{10}x{11}x{12}_{13}x{14}x1_{15}x{16}_{17}x{18}_{19}x{20}) +BENCHMARK(BM_SM{22}_device_{0}_{1}_{2}_srgemm_{4}{5}_{6}_{10}x{11}x{12}_{13}x{14}x1_{15}x{16}_{17}x{18}_{19}x{20}) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -140,9 +140,9 @@ def write_benchmark_to_file( mult_op, precision_char, precision_type, - transA, - transB, - transC, + tnspA, + tnspB, + tnspC, threadblock_tile, unroll, warp_shape, @@ -151,7 +151,8 @@ def write_benchmark_to_file( warp_threadsM, warp_threadsN, warps_per_tb, - bench_level): + bench_level, + sm_arch): print("{:.0f}x{:.0f}x{:.0f}__{:.0f}x{:.0f}_{:.0f}x{:.0f}_{:.0f}x{:.0f}".format( threadblock_tile[0], threadblock_tile[1], unroll, thread_tileM, thread_tileN, @@ -165,21 +166,21 @@ def write_benchmark_to_file( threadblock_tile[0], threadblock_tile[1], unroll )) - trans_typeA = "Column" if transA == "n" else "Row" - trans_typeB = "Column" if transB == "n" else "Row" - trans_typeC = "Column" if transC == "n" else "Row" + tnsp_typeA = "Column" if tnspA == "n" else "Row" + tnsp_typeB = "Column" if tnspB == "n" else "Row" + tnsp_typeC = "Column" if tnspC == "n" else "Row" print(precision_type) benchfile.write(bench_template.format( add_op, # 0 mult_op, # 1 precision_char, # 2 precision_type, # 3 - transA, # 4 - transB, # 5 - transC, # 6 - trans_typeA, # 7 - trans_typeB, # 8 - trans_typeC, # 9 + tnspA, # 4 + tnspB, # 5 + tnspC, # 6 + tnsp_typeA, # 7 + tnsp_typeB, # 8 + tnsp_typeC, # 9 int(threadblock_tile[0]), # 10 
int(threadblock_tile[1]), # 11 int(unroll), # 12 @@ -191,11 +192,12 @@ def write_benchmark_to_file( int(warp_threadsN), # 18 int(warps_per_tb[0]), # 19 int(warps_per_tb[1]), # 20 - int(bench_level) # 21 + int(bench_level), # 21 + int(sm_arch) # 22 )) -def main(output_dir: str): +def main(args): # warps per threadblock warps_per_threadblocks = [] for warps_per_tb0 in WARPS_PER_TB_EDGE: @@ -237,22 +239,22 @@ def main(output_dir: str): tb_tiles_L0 = precision[4] # transposes - for transpose in transposes: - # get transpose char - column_major_A = transpose[0] - column_major_B = transpose[1] - column_major_C = transpose[2] - transA = "n" if column_major_A else "t" - transB = "n" if column_major_B else "t" - transC = "n" if column_major_C else "t" + for tnsppose in transposes: + # get tnsppose char + column_major_A = tnsppose[0] + column_major_B = tnsppose[1] + column_major_C = tnsppose[2] + tnspA = "n" if column_major_A else "t" + tnspB = "n" if column_major_B else "t" + tnspC = "n" if column_major_C else "t" # open file - benchfile_name = "simt_{}_{}_{}srgemm_{}{}_{}_sm50.cu".format( - add_op, mult_op, precision_char, - transA, transB, transC) + benchfile_name = "sm{}_simt_{}_{}_{}_srgemm_{}{}_{}.cu".format( + args.sm_arch, add_op, mult_op, precision_char, + tnspA, tnspB, tnspC) print("\n", benchfile_name) - filePath = os.path.join(output_dir, benchfile_name) + filePath = os.path.join(args.output_dir, benchfile_name) with open(filePath, "w") as benchfile: write_benchmark_file_header(benchfile) @@ -356,9 +358,9 @@ def main(output_dir: str): mult_op, precision_char, precision_type, - transA, - transB, - transC, + tnspA, + tnspB, + tnspC, threadblock_tile, unroll, warp_shape, @@ -367,12 +369,18 @@ def main(output_dir: str): warp_threadsM, warp_threadsN, warps_per_tb, - bench_level) + bench_level, + args.sm_arch) num_benches += 1 print("Benchmarks per level = {}, {}, {}".format(benchcount_L0, benchcount_L1, benchcount_L2)) print("Total bench count per semi-ring = 
{}".format(num_benches // len(semiring_operators))) - if __name__ == "__main__": - main(".") + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output-dir", type=str, required=False, default=".", + help="Path to the output dir.") + parser.add_argument("-sm", "--sm-arch", type=int, required=False, default=50, choices=[50, 80], + help="SM architecture version number,") + args = parser.parse_args(sys.argv[1:]) + main(args) diff --git a/bench/device/harness.h b/bench/device/harness.h index c85361b..4386d0a 100644 --- a/bench/device/harness.h +++ b/bench/device/harness.h @@ -1,3 +1,34 @@ +/*************************************************************************************************** + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ #pragma once #include "cutlass/util/distribution.h" @@ -156,8 +187,8 @@ class BenchHarness { // Runs one loop of the benchmark on initialized tensors auto run(int split_k_slices = 1, - ElementCompute alpha = ElementCompute(Srgemm::MultiplicationOp::Identity), - ElementCompute beta = ElementCompute(Srgemm::MultiplicationOp::Identity)) + ElementCompute alpha = ElementCompute(Srgemm::RingOp::MultIdentity), + ElementCompute beta = ElementCompute(Srgemm::RingOp::MultIdentity)) -> cutlass::Status { // Initialize the GEMM operator typename Srgemm::Arguments arguments { diff --git a/bench/device/simt_binary_or_binary_and_ssrgemm_nn_n_sm50.cu b/bench/device/simt_binary_or_binary_and_ssrgemm_nn_n_sm50.cu deleted file mode 100644 index 9da2e71..0000000 --- a/bench/device/simt_binary_or_binary_and_ssrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N 
}); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); 
-} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) 
- ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 
-// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and 
(CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_binary_or_binary_and_ssrgemm_nn_t_sm50.cu b/bench/device/simt_binary_or_binary_and_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index d144940..0000000 --- a/bench/device/simt_binary_or_binary_and_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N 
}); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_binary_or_binary_and_ssrgemm_nt_n_sm50.cu b/bench/device/simt_binary_or_binary_and_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index e2b3ec8..0000000 --- a/bench/device/simt_binary_or_binary_and_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N 
}); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_binary_or_binary_and_ssrgemm_nt_t_sm50.cu b/bench/device/simt_binary_or_binary_and_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index 4a24d59..0000000 --- a/bench/device/simt_binary_or_binary_and_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - 
- // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_binary_or_binary_and_ssrgemm_tn_n_sm50.cu b/bench/device/simt_binary_or_binary_and_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index 7cf2777..0000000 --- a/bench/device/simt_binary_or_binary_and_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N 
}); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_binary_or_binary_and_ssrgemm_tn_t_sm50.cu b/bench/device/simt_binary_or_binary_and_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index e98898d..0000000 --- a/bench/device/simt_binary_or_binary_and_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - 
- // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_binary_or_binary_and_ssrgemm_tt_n_sm50.cu b/bench/device/simt_binary_or_binary_and_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index 48e8bcb..0000000 --- a/bench/device/simt_binary_or_binary_and_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - 
- // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_binary_or_binary_and_ssrgemm_tt_t_sm50.cu b/bench/device/simt_binary_or_binary_and_ssrgemm_tt_t_sm50.cu deleted file mode 100644 index b2ece1d..0000000 --- a/bench/device/simt_binary_or_binary_and_ssrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); 
- } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// 
Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp 
= Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
1) -static void BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_binary_or_binary_and_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_minimum_dsrgemm_nt_n_sm50.cu b/bench/device/simt_maximum_minimum_dsrgemm_nt_n_sm50.cu deleted file mode 100644 index de5e233..0000000 --- a/bench/device/simt_maximum_minimum_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) 
{ - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr 
= 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 
16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_minimum_dsrgemm_tn_n_sm50.cu b/bench/device/simt_maximum_minimum_dsrgemm_tn_n_sm50.cu deleted file mode 100644 index 0087de9..0000000 --- a/bench/device/simt_maximum_minimum_dsrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - 
- using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) 
{ - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 
2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_minimum_ssrgemm_nn_t_sm50.cu b/bench/device/simt_maximum_minimum_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index c73ae69..0000000 --- a/bench/device/simt_maximum_minimum_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_minimum_ssrgemm_nt_n_sm50.cu b/bench/device/simt_maximum_minimum_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index 2807465..0000000 --- a/bench/device/simt_maximum_minimum_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_minimum_ssrgemm_nt_t_sm50.cu b/bench/device/simt_maximum_minimum_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index 2127978..0000000 --- a/bench/device/simt_maximum_minimum_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - 
= benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_minimum_ssrgemm_tn_n_sm50.cu b/bench/device/simt_maximum_minimum_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index ab4bf33..0000000 --- a/bench/device/simt_maximum_minimum_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_minimum_ssrgemm_tn_t_sm50.cu b/bench/device/simt_maximum_minimum_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index 428979a..0000000 --- a/bench/device/simt_maximum_minimum_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 
* N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_minimum_ssrgemm_tt_n_sm50.cu b/bench/device/simt_maximum_minimum_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index 2a27f44..0000000 --- a/bench/device/simt_maximum_minimum_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 
* N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_dsrgemm_nn_t_sm50.cu b/bench/device/simt_maximum_multiplies_dsrgemm_nn_t_sm50.cu deleted file mode 100644 index f6cb9f2..0000000 --- a/bench/device/simt_maximum_multiplies_dsrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - 
// benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) 
-static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp 
= Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for 
(auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } 
- - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) 
- ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// 
Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_dsrgemm_nt_t_sm50.cu b/bench/device/simt_maximum_multiplies_dsrgemm_nt_t_sm50.cu deleted file mode 100644 index 71f2a55..0000000 --- a/bench/device/simt_maximum_multiplies_dsrgemm_nt_t_sm50.cu +++ 
/dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness 
- cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ 
: state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, 
SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness 
- cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 
128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_dsrgemm_tn_n_sm50.cu b/bench/device/simt_maximum_multiplies_dsrgemm_tn_n_sm50.cu deleted file mode 100644 index bedb8cb..0000000 --- a/bench/device/simt_maximum_multiplies_dsrgemm_tn_n_sm50.cu +++ /dev/null @@ 
-1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } 
- - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) 
-static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git 
a/bench/device/simt_maximum_multiplies_dsrgemm_tn_t_sm50.cu b/bench/device/simt_maximum_multiplies_dsrgemm_tn_t_sm50.cu deleted file mode 100644 index 9201e57..0000000 --- a/bench/device/simt_maximum_multiplies_dsrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // 
setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } 
- - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - 
double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_ssrgemm_nn_n_sm50.cu 
b/bench/device/simt_maximum_multiplies_ssrgemm_nn_n_sm50.cu deleted file mode 100644 index 678313c..0000000 --- a/bench/device/simt_maximum_multiplies_ssrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - 
} - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop 
- for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL 
>= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_ssrgemm_nn_t_sm50.cu b/bench/device/simt_maximum_multiplies_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index 0730ca4..0000000 --- a/bench/device/simt_maximum_multiplies_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
// setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) 
- ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// 
Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - 
using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp 
= Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N 
}); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_ssrgemm_nt_n_sm50.cu b/bench/device/simt_maximum_multiplies_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index 205990d..0000000 --- a/bench/device/simt_maximum_multiplies_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); 
- } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // 
- precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_ssrgemm_nt_t_sm50.cu b/bench/device/simt_maximum_multiplies_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index 1965e89..0000000 --- a/bench/device/simt_maximum_multiplies_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for 
(auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - 
double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 
8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
// setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 
256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for 
(auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } 
- - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_ssrgemm_tn_n_sm50.cu 
b/bench/device/simt_maximum_multiplies_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index e2b3445..0000000 --- a/bench/device/simt_maximum_multiplies_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - 
- double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) 
-static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop 
- for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); 
- } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) 
- ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// 
Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - 
using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp 
= Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N 
}); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_ssrgemm_tn_t_sm50.cu b/bench/device/simt_maximum_multiplies_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index 05e2794..0000000 --- a/bench/device/simt_maximum_multiplies_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ 
: state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 
8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - 
} - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 
64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_ssrgemm_tt_n_sm50.cu b/bench/device/simt_maximum_multiplies_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index af4b94a..0000000 --- a/bench/device/simt_maximum_multiplies_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - 
- // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } 
- - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) 
- ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_maximum_multiplies_ssrgemm_tt_t_sm50.cu b/bench/device/simt_maximum_multiplies_ssrgemm_tt_t_sm50.cu deleted file mode 100644 index 
e014bac..0000000 --- a/bench/device/simt_maximum_multiplies_ssrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 
64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 
64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
// setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark 
loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_maximum_multiplies_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git 
a/bench/device/simt_minimum_maximum_dsrgemm_nn_n_sm50.cu b/bench/device/simt_minimum_maximum_dsrgemm_nn_n_sm50.cu deleted file mode 100644 index 070c2c6..0000000 --- a/bench/device/simt_minimum_maximum_dsrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 
2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; 
- using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_maximum_dsrgemm_nt_n_sm50.cu b/bench/device/simt_minimum_maximum_dsrgemm_nt_n_sm50.cu deleted file mode 100644 index 5412493..0000000 --- 
a/bench/device/simt_minimum_maximum_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) 
{ - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 
* N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 
64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_maximum_dsrgemm_tt_n_sm50.cu b/bench/device/simt_minimum_maximum_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index 6c5950c..0000000 --- a/bench/device/simt_minimum_maximum_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { 
- benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 
* N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - 
= benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_maximum_dsrgemm_tt_t_sm50.cu b/bench/device/simt_minimum_maximum_dsrgemm_tt_t_sm50.cu deleted file mode 100644 index ddeb7ee..0000000 --- a/bench/device/simt_minimum_maximum_dsrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); 
- } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) 
{ - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * 
N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_maximum_ssrgemm_nn_t_sm50.cu b/bench/device/simt_minimum_maximum_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index f71960f..0000000 --- a/bench/device/simt_minimum_maximum_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_maximum_ssrgemm_nt_n_sm50.cu b/bench/device/simt_minimum_maximum_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index f05a891..0000000 --- a/bench/device/simt_minimum_maximum_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_maximum_ssrgemm_nt_t_sm50.cu b/bench/device/simt_minimum_maximum_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index 9e59aea..0000000 --- a/bench/device/simt_minimum_maximum_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - 
= benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_maximum_ssrgemm_tn_n_sm50.cu b/bench/device/simt_minimum_maximum_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index b5e1dd0..0000000 --- a/bench/device/simt_minimum_maximum_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_maximum_ssrgemm_tn_t_sm50.cu b/bench/device/simt_minimum_maximum_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index 00ad4c6..0000000 --- a/bench/device/simt_minimum_maximum_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 
* N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_maximum_ssrgemm_tt_n_sm50.cu b/bench/device/simt_minimum_maximum_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index 7676bf6..0000000 --- a/bench/device/simt_minimum_maximum_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 
* N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_dsrgemm_nn_n_sm50.cu b/bench/device/simt_minimum_multiplies_dsrgemm_nn_n_sm50.cu deleted file mode 100644 index 625baad..0000000 --- a/bench/device/simt_minimum_multiplies_dsrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); 
- - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) 
-static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - 
using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop 
- for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) 
- ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git 
a/bench/device/simt_minimum_multiplies_dsrgemm_nn_t_sm50.cu b/bench/device/simt_minimum_multiplies_dsrgemm_nn_t_sm50.cu deleted file mode 100644 index 6f0dfef..0000000 --- a/bench/device/simt_minimum_multiplies_dsrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - 
double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - 
using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_dsrgemm_tt_n_sm50.cu b/bench/device/simt_minimum_multiplies_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index c5b8c24..0000000 --- a/bench/device/simt_minimum_multiplies_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static 
void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for 
(auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - 
double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// 
Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_dsrgemm_tt_t_sm50.cu b/bench/device/simt_minimum_multiplies_dsrgemm_tt_t_sm50.cu deleted file mode 100644 index 5445238..0000000 --- a/bench/device/simt_minimum_multiplies_dsrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 
x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness 
- cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ 
: state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 
32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_ssrgemm_nn_n_sm50.cu b/bench/device/simt_minimum_multiplies_ssrgemm_nn_n_sm50.cu deleted file mode 100644 index e1c3432..0000000 --- a/bench/device/simt_minimum_multiplies_ssrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - 
- // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 
64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop 
- for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL 
>= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_ssrgemm_nn_t_sm50.cu b/bench/device/simt_minimum_multiplies_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index 4e98030..0000000 --- a/bench/device/simt_minimum_multiplies_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
// setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) 
- ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// 
Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - 
using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp 
= Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N 
}); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_ssrgemm_nt_n_sm50.cu b/bench/device/simt_minimum_multiplies_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index e4adac9..0000000 --- a/bench/device/simt_minimum_multiplies_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); 
- } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // 
- precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_ssrgemm_nt_t_sm50.cu b/bench/device/simt_minimum_multiplies_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index 37620e0..0000000 --- a/bench/device/simt_minimum_multiplies_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for 
(auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - 
double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 
8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
// setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 
256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for 
(auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } 
- - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_ssrgemm_tn_n_sm50.cu 
b/bench/device/simt_minimum_multiplies_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index 04b377b..0000000 --- a/bench/device/simt_minimum_multiplies_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - 
- double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) 
-static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop 
- for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); 
- } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) 
- ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// 
Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - 
using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp 
= Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N 
}); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_ssrgemm_tn_t_sm50.cu b/bench/device/simt_minimum_multiplies_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index 41700e3..0000000 --- a/bench/device/simt_minimum_multiplies_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ 
: state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 
8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - 
} - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 
64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_ssrgemm_tt_n_sm50.cu b/bench/device/simt_minimum_multiplies_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index e65807a..0000000 --- a/bench/device/simt_minimum_multiplies_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - 
- // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench 
harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } 
- - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) 
- ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_multiplies_ssrgemm_tt_t_sm50.cu b/bench/device/simt_minimum_multiplies_ssrgemm_tt_t_sm50.cu deleted file mode 100644 index 
b23ec39..0000000 --- a/bench/device/simt_minimum_multiplies_ssrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 
64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 
64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
// setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark 
loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_multiplies_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git 
a/bench/device/simt_minimum_plus_dsrgemm_nt_n_sm50.cu b/bench/device/simt_minimum_plus_dsrgemm_nt_n_sm50.cu deleted file mode 100644 index 783bd30..0000000 --- a/bench/device/simt_minimum_plus_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - 
- // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for 
(auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr 
= 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 
4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = 
static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - 
- // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - 
for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_plus_dsrgemm_tn_t_sm50.cu b/bench/device/simt_minimum_plus_dsrgemm_tn_t_sm50.cu deleted file mode 100644 index 80710e7..0000000 --- a/bench/device/simt_minimum_plus_dsrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark 
loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 
4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto 
N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; 
- - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // 
- ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N 
* N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 
-// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) 
and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto 
N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - 
using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_minimum_plus_dsrgemm_tt_n_sm50.cu b/bench/device/simt_minimum_plus_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index 3984e0c..0000000 --- a/bench/device/simt_minimum_plus_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark 
loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 
4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto 
N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; 
- - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // 
- ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N 
* N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 
-// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) 
and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto 
N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - 
using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_dsrgemm_nn_n_sm50.cu b/bench/device/simt_plus_multiplies_dsrgemm_nn_n_sm50.cu deleted file mode 100644 index b84e746..0000000 --- a/bench/device/simt_plus_multiplies_dsrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - 
state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 
8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 
32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_dsrgemm_nn_t_sm50.cu b/bench/device/simt_plus_multiplies_dsrgemm_nn_t_sm50.cu deleted file mode 100644 index ca2fe72..0000000 --- a/bench/device/simt_plus_multiplies_dsrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - 
- using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) 
{ - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 
2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_dsrgemm_nt_n_sm50.cu b/bench/device/simt_plus_multiplies_dsrgemm_nt_n_sm50.cu deleted file mode 100644 index f9f4901..0000000 --- a/bench/device/simt_plus_multiplies_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - 
- using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) 
{ - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 
2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_dsrgemm_nt_t_sm50.cu b/bench/device/simt_plus_multiplies_dsrgemm_nt_t_sm50.cu deleted file mode 100644 index 19a0ba9..0000000 --- a/bench/device/simt_plus_multiplies_dsrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,1815 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - 
= benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * 
N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_ssrgemm_nn_t_sm50.cu b/bench/device/simt_plus_multiplies_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index 1ad881f..0000000 --- a/bench/device/simt_plus_multiplies_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_ssrgemm_nt_n_sm50.cu b/bench/device/simt_plus_multiplies_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index f1da382..0000000 --- a/bench/device/simt_plus_multiplies_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_ssrgemm_nt_t_sm50.cu b/bench/device/simt_plus_multiplies_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index eece74e..0000000 --- a/bench/device/simt_plus_multiplies_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - 
= benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_ssrgemm_tn_n_sm50.cu b/bench/device/simt_plus_multiplies_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index 31b1e5d..0000000 --- a/bench/device/simt_plus_multiplies_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] 
- = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { 
- const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 
-#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State 
&state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_ssrgemm_tn_t_sm50.cu b/bench/device/simt_plus_multiplies_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index 5c0fda3..0000000 --- a/bench/device/simt_plus_multiplies_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 
* N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/simt_plus_multiplies_ssrgemm_tt_n_sm50.cu b/bench/device/simt_plus_multiplies_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index 07fa1aa..0000000 --- a/bench/device/simt_plus_multiplies_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2865 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -////////////////////////////////////////////////////////////////////// -// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -////////////////////////////////////////////////////////////////////// - -#include "benchmark/benchmark.h" - -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -#include "harness.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // 
benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - 
cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { - 
const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - 
benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 
* N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { - const 
auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - 
cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : 
state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double 
flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) - 
->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) 
{ - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { - const auto N = static_cast(state.range(0)); - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - // setup bench harness - cuasr::bench::device::BenchHarness bench({ N, N, N }); - - // benchmark loop - for (auto _ : state) { - benchmark::DoNotOptimize(bench.run()); - cudaDeviceSynchronize(); - } - - double flops_per_itr = 2.0 * N * N * N; - state.counters["Flop/s"] - = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); -} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) - ->RangeMultiplier(2)->Range(256, 4096); -#endif - diff --git a/bench/device/sm50_defaults.cu b/bench/device/sm50_defaults.cu new file mode 100644 index 0000000..903aba8 --- /dev/null +++ b/bench/device/sm50_defaults.cu @@ -0,0 +1,6158 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using 
OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f64_srgemm_nn_t(benchmark::State 
&state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_default_plus_mult_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + 
= benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + 
// benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // 
setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_plus_mult_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_plus_mult_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using 
OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f64_srgemm_nt_n(benchmark::State &state) 
{ + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_default_min_plus_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // 
benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + 
// setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_plus_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_plus_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f64_srgemm_tt_t(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static 
void BM_SM50_default_max_plus_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_default_max_plus_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); 
+ } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_plus_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_plus_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f64_srgemm_tt_n(benchmark::State &state) { + 
const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static 
void BM_SM50_default_min_max_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_default_min_max_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { 
+ benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_max_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_max_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void 
BM_SM50_default_max_min_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_default_max_min_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + 
} + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) 
{ + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using 
precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_min_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_min_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void 
BM_SM50_default_min_mult_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_default_min_mult_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // 
benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // 
setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_s32_srgemm_nt_t(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_min_mult_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static 
void BM_SM50_default_min_mult_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_min_mult_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_default_max_mult_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // 
benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp 
= cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_s32_srgemm_nt_n(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static 
void BM_SM50_default_max_mult_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_max_mult_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_max_mult_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_default_or_and_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) 
{ + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, 
SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // 
+ RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM50_default_or_and_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void 
BM_SM50_default_or_and_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_default_or_and_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); diff --git a/bench/device/simt_plus_multiplies_ssrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_max_min_f32_srgemm_nn_n.cu similarity index 76% rename from bench/device/simt_plus_multiplies_ssrgemm_nn_n_sm50.cu rename to bench/device/sm50_simt_max_min_f32_srgemm_nn_n.cu index d6d2dbb..9b39ccc 100644 --- a/bench/device/simt_plus_multiplies_ssrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_max_min_f32_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static 
void BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; 
using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, 
// precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void 
BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 
// Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static 
void BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { 
const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2 // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1959,7 +1920,7 
@@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_ // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); 
} -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2159,7 +2116,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4 // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2209,7 +2165,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_ // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4 // Warps / Block: 4 
x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static 
void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) 
{ +static void BM_SM50_device_max_min_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_ // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_max_min_f32_srgemm_nn_t.cu b/bench/device/sm50_simt_max_min_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..0111535 --- /dev/null +++ b/bench/device/sm50_simt_max_min_f32_srgemm_nn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, 
N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / 
Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_min_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, 
N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) 
+ ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_min_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); 
+ + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] 
+ = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ 
N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_min_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, 
// + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) 
+ ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 
+// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_min_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, 
// + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_min_f32_srgemm_nt_n.cu b/bench/device/sm50_simt_max_min_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..4e71ce1 --- /dev/null +++ b/bench/device/sm50_simt_max_min_f32_srgemm_nt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // edge length of the square N x N x N problem for this run
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // semiring operator pair benchmarked here
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp; // epilogue taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                           //
+      RingOp,                                                           //
+      precision, cutlass::layout::ColumnMajor,                          //
+      precision, cutlass::layout::RowMajor,                             //
+      precision, cutlass::layout::ColumnMajor,                          //
+      precision, OpClass, SmArch,                                       //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness: bind the kernel type configured above to an N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration runs the harness once, then waits for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N; // two semiring operations per inner-product term
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, 
// + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git 
a/bench/device/sm50_simt_max_min_f32_srgemm_nt_t.cu b/bench/device/sm50_simt_max_min_f32_srgemm_nt_t.cu new file mode 100644 index 0000000..d98c2db --- /dev/null +++ b/bench/device/sm50_simt_max_min_f32_srgemm_nt_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
+ precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { 
+ benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape 
= cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness 
+ cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + 
+ double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_min_f32_srgemm_tn_n.cu b/bench/device/sm50_simt_max_min_f32_srgemm_tn_n.cu new file mode 100644 index 0000000..6b63532 --- /dev/null +++ b/bench/device/sm50_simt_max_min_f32_srgemm_tn_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using 
EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif +
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem size from the range argument
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel type handed to the harness
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: call the harness, then wait for the device so queued GPU work is timed
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_min_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_min_f32_srgemm_tn_t.cu b/bench/device/sm50_simt_max_min_f32_srgemm_tn_t.cu new file mode 100644 index 0000000..ae956e7 --- /dev/null +++ b/bench/device/sm50_simt_max_min_f32_srgemm_tn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + 
using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + 
// benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 
4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness 
+ cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 1
// Threadblock: 128 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
// Benchmarks the float max_min SRGEMM (OpClassSimt, Sm50) with a 128x32x8
// threadblock tile and a 64x32x8 warp tile.
// NOTE(review): <int>, <precision> and <Srgemm> below were missing from the
// stripped text and are restored here -- confirm against upstream.
static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
  // The range argument is used for all three problem dimensions below.
  const auto N = static_cast<int>(state.range(0));
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: wait for the device after every run so the timed region
  // includes kernel completion
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate counter
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness 
+ cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 16 x 64 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// Benchmarks the float max_min SRGEMM (OpClassSimt, Sm50) with a 16x64x16
// threadblock tile and an 8x16x16 warp tile.
// NOTE(review): <int>, <precision> and <Srgemm> below were missing from the
// stripped text and are restored here -- confirm against upstream.
static void BM_SM50_device_max_min_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
  // The range argument is used for all three problem dimensions below.
  const auto N = static_cast<int>(state.range(0));
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: wait for the device after every run so the timed region
  // includes kernel completion
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate counter
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)  // built only when the bench level is at least 1
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)  // built only when the bench level is at least 1
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)  // built only when the bench level is at least 0
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)  // built only when the bench level is at least 1
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)  // built only when the bench level is at least 1
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)  // built only when the bench level is at least 1
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)  // built only when the bench level is at least 1
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)  // built only when the bench level is at least 2
+static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) {  // max_min f32 SRGEMM benchmark for the tile shapes listed above
+  const auto N = static_cast(state.range(0));  // problem size from the benchmark range; NOTE(review): cast target type not visible here - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op comes from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for an N x N x N problem; NOTE(review): the Srgemm alias is never referenced as shown - confirm the harness type
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop: each timed iteration calls run() once and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // return code is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);  // rate counter scaled by the iteration count
+}
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4)
+  ->RangeMultiplier(2)->Range(256, 4096);  // N = 256, 512, ..., 4096
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_minimum_plus_ssrgemm_tn_t_sm50.cu b/bench/device/sm50_simt_max_min_f32_srgemm_tt_n.cu similarity index 77% rename from bench/device/simt_minimum_plus_ssrgemm_tn_t_sm50.cu rename to bench/device/sm50_simt_max_min_f32_srgemm_tt_n.cu index a8481af..503ac4a 100644 --- a/bench/device/simt_minimum_plus_ssrgemm_tn_t_sm50.cu +++ b/bench/device/sm50_simt_max_min_f32_srgemm_tt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, 
OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, 
+ RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1283,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1293,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1342,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,30 +1390,29 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1440,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape 
= cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1489,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1538,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1587,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1636,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1675,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1685,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1734,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,29 +1783,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1859,7 +1822,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,29 +1832,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1909,7 +1871,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,29 +1881,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1959,7 +1920,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,29 +1930,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,29 +1979,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,29 +2028,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, 
// precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,29 +2077,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2159,7 +2116,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,30 +2125,29 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2209,7 +2165,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,29 +2175,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,29 +2224,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,29 +2273,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using 
WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,29 +2322,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,29 +2371,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2459,7 +2410,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,29 +2420,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,29 +2469,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2559,7 +2508,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,29 +2518,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // 
Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2609,7 +2557,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,29 +2567,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2659,7 +2606,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,29 +2616,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,29 +2665,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2759,7 +2704,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,29 +2714,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,29 +2812,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2859,7 +2900,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_plus_multiplies_ssrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_max_min_f32_srgemm_tt_t.cu similarity index 76% rename from bench/device/simt_plus_multiplies_ssrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_max_min_f32_srgemm_tt_t.cu index 8631856..6b461d4 100644 --- a/bench/device/simt_plus_multiplies_ssrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_max_min_f32_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp 
= Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ 
-259,7 +254,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_ 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x // Warps 
/ Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static 
void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; 
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ 
-1159,7 +1136,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2 // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // 
Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_ // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2159,7 +2116,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4 // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2209,7 +2165,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_ // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2309,7 +2263,7 @@ static void 
BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4 // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) 
+BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4 // Warps / Block: 4 x 4 // 
Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static 
void BM_SM50_device_max_min_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const 
auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_ // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_minimum_dsrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_max_min_f64_srgemm_nn_n.cu similarity index 69% rename from bench/device/simt_maximum_minimum_dsrgemm_nn_n_sm50.cu rename to bench/device/sm50_simt_max_min_f64_srgemm_nn_n.cu index 578387d..816090f 100644 --- a/bench/device/simt_maximum_minimum_dsrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_max_min_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
+ RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void 
BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_max_min_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static void 
BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_multiplies_dsrgemm_nt_n_sm50.cu b/bench/device/sm50_simt_max_min_f64_srgemm_nn_t.cu similarity index 69% rename from 
bench/device/simt_minimum_multiplies_dsrgemm_nt_n_sm50.cu rename to bench/device/sm50_simt_max_min_f64_srgemm_nn_t.cu index 44cc104..4c7b620 100644 --- a/bench/device/simt_minimum_multiplies_dsrgemm_nt_n_sm50.cu +++ b/bench/device/sm50_simt_max_min_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8 // Warps / Block: 
2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 
+607,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_ // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_ // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_max_min_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_ // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8 // Warps / 
Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / 
Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x // Warps 
/ Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif 
@@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ 
BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x16_32x8x1_4x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_binary_or_binary_and_dsrgemm_tn_n_sm50.cu b/bench/device/sm50_simt_max_min_f64_srgemm_nt_n.cu 
similarity index 69% rename from bench/device/simt_binary_or_binary_and_dsrgemm_tn_n_sm50.cu rename to bench/device/sm50_simt_max_min_f64_srgemm_nt_n.cu index 30bb6b7..f481cf8 100644 --- a/bench/device/simt_binary_or_binary_and_dsrgemm_tn_n_sm50.cu +++ b/bench/device/sm50_simt_max_min_f64_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,28 +19,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -59,7 +58,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,28 +68,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -109,7 +107,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,28 +117,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -159,7 +156,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_16x64x1_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,28 +166,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -209,7 +205,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,28 +215,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -259,7 +254,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,28 +264,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
+ precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -309,7 +303,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,28 +313,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -359,7 +352,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,28 +362,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -409,7 +401,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,28 +411,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -459,7 +450,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,28 +460,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x128x8_16x64x1_4x8_ // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -509,7 +499,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,28 +509,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -559,7 +548,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,28 +558,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -609,7 +597,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,28 +607,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -659,7 +646,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,28 +656,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -709,7 +695,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,28 +705,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -759,7 +744,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,28 +754,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -809,7 +793,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,28 +803,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -859,7 +842,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,28 +852,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -909,7 +891,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,28 +901,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x8_16x64x1_4x8_ // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -959,7 +940,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,28 +950,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,28 +999,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4 // 
Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x8_64x16x1_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,28 +1048,27 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x8_64x16x1_8x4_ // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,28 +1097,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,28 +1146,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_16x128x16_8x32x1_2x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x8x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,28 +1195,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,28 +1244,76 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N 
}); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1309,7 +1332,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x8_16x32x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,28 +1342,76 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x8_16x32x1_4x4_ // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,28 +1440,27 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,28 +1489,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,28 +1538,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,28 +1587,125 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1559,7 +1724,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x8_32x16x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,28 +1734,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x8_32x16x1_4x4_ // Warps / Block: 4 x 4 // 
Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,28 +1783,27 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,28 +1832,27 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_32x128x16_8x32x1_2x4_ // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x16_16x8x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,28 +1881,76 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,28 +1979,76 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x16_32x8x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_plus_dsrgemm_nt_t_sm50.cu b/bench/device/sm50_simt_max_min_f64_srgemm_nt_t.cu similarity index 69% rename from 
bench/device/simt_minimum_plus_dsrgemm_nt_t_sm50.cu rename to bench/device/sm50_simt_max_min_f64_srgemm_nt_t.cu index 9d9a901..9c679ad 100644 --- a/bench/device/simt_minimum_plus_dsrgemm_nt_t_sm50.cu +++ b/bench/device/sm50_simt_max_min_f64_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ 
static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 
+313,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // 
Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void 
BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / 
Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) 
and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_max_min_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void 
BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_maximum_dsrgemm_tn_n_sm50.cu b/bench/device/sm50_simt_max_min_f64_srgemm_tn_n.cu similarity index 69% rename from bench/device/simt_minimum_maximum_dsrgemm_tn_n_sm50.cu rename to 
bench/device/sm50_simt_max_min_f64_srgemm_tn_n.cu index 1b98cb3..3904631 100644 --- a/bench/device/simt_minimum_maximum_dsrgemm_tn_n_sm50.cu +++ b/bench/device/sm50_simt_max_min_f64_srgemm_tn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void 
BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif 
@@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x // Warps 
/ Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static 
void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { 
+static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const 
auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // 
- precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 
@@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_ 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 
16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, 
// precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git 
a/bench/device/simt_maximum_multiplies_dsrgemm_tt_n_sm50.cu b/bench/device/sm50_simt_max_min_f64_srgemm_tn_t.cu similarity index 69% rename from bench/device/simt_maximum_multiplies_dsrgemm_tt_n_sm50.cu rename to bench/device/sm50_simt_max_min_f64_srgemm_tn_t.cu index 6b247c9..c6b0084 100644 --- a/bench/device/simt_maximum_multiplies_dsrgemm_tt_n_sm50.cu +++ b/bench/device/sm50_simt_max_min_f64_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x // Warps / 
Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 
+509,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_ // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_ // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_ // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8 // Warps / 
Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / 
Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x // Warps / 
Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ 
-1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ 
BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x16_32x8x1_4x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_plus_multiplies_dsrgemm_tn_t_sm50.cu b/bench/device/sm50_simt_max_min_f64_srgemm_tt_n.cu similarity 
index 69% rename from bench/device/simt_plus_multiplies_dsrgemm_tn_t_sm50.cu rename to bench/device/sm50_simt_max_min_f64_srgemm_tt_n.cu index cd96536..a270cb0 100644 --- a/bench/device/simt_plus_multiplies_dsrgemm_tn_t_sm50.cu +++ b/bench/device/sm50_simt_max_min_f64_srgemm_tt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 
16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void 
BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, 
cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp 
= cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision 
= double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + 
const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using 
SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void 
BM_SM50_device_max_min_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / 
Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_dsrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_max_min_f64_srgemm_tt_t.cu similarity index 69% rename from bench/device/simt_maximum_plus_dsrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_max_min_f64_srgemm_tt_t.cu index 120e463..0f4ee44 100644 --- a/bench/device/simt_maximum_plus_dsrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_max_min_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ 
-169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // 
Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void 
BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / 
Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_min_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // 
- AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_min_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) // 
Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_min_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_min_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_min_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; 
+ using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_min_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_minimum_ssrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_max_mult_f32_srgemm_nn_n.cu similarity index 76% rename from bench/device/simt_maximum_minimum_ssrgemm_nn_n_sm50.cu rename to bench/device/sm50_simt_max_mult_f32_srgemm_nn_n.cu index 7812c2c..c74f8b0 100644 --- a/bench/device/simt_maximum_minimum_ssrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_max_mult_f32_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static 
void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N 
= static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 
8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static 
void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2 // Warps / 
Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State 
&state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2 // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void 
BM_SM50_device_max_mult_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { 
const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_ // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2109,7 +2067,7 @@ static void 
BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2159,7 +2116,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4 // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2209,7 +2165,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_ // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4 // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4 // Warps / 
Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State 
&state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { 
const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_ // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_max_mult_f32_srgemm_nn_t.cu b/bench/device/sm50_simt_max_mult_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..1c7feaa --- /dev/null +++ b/bench/device/sm50_simt_max_mult_f32_srgemm_nn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_mult_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // 
+ precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 
128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 
4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_mult_f32_srgemm_nt_n.cu b/bench/device/sm50_simt_max_mult_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..a100d78 --- /dev/null +++ b/bench/device/sm50_simt_max_mult_f32_srgemm_nt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } 
+ + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_mult_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 
+// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_mult_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass 
= cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_minimum_plus_ssrgemm_nt_t_sm50.cu b/bench/device/sm50_simt_max_mult_f32_srgemm_nt_t.cu similarity index 77% rename from bench/device/simt_minimum_plus_ssrgemm_nt_t_sm50.cu rename to bench/device/sm50_simt_max_mult_f32_srgemm_nt_t.cu index 0d9dfc4..78211d4 100644 --- a/bench/device/simt_minimum_plus_ssrgemm_nt_t_sm50.cu +++ b/bench/device/sm50_simt_max_mult_f32_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 
x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / 
Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 
8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +1773,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps 
/ Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2159,7 +2116,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State 
&state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2209,7 +2165,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N 
= static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch 
= cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 
16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git 
a/bench/device/simt_minimum_plus_ssrgemm_tn_n_sm50.cu b/bench/device/sm50_simt_max_mult_f32_srgemm_tn_n.cu similarity index 77% rename from bench/device/simt_minimum_plus_ssrgemm_tn_n_sm50.cu rename to bench/device/sm50_simt_max_mult_f32_srgemm_tn_n.cu index 9ca0344..8a1a1ca 100644 --- a/bench/device/simt_minimum_plus_ssrgemm_tn_n_sm50.cu +++ b/bench/device/sm50_simt_max_mult_f32_srgemm_tn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ 
-419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 
// Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; 
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // 
@@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // 
Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 
16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 
256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2159,7 +2116,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2209,7 +2165,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N 
= static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_max_mult_f32_srgemm_tn_t.cu b/bench/device/sm50_simt_max_mult_f32_srgemm_tn_t.cu new file mode 100644 index 0000000..7418faa --- /dev/null +++ b/bench/device/sm50_simt_max_mult_f32_srgemm_tn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); 
+ } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_mult_f32_srgemm_tt_n.cu b/bench/device/sm50_simt_max_mult_f32_srgemm_tt_n.cu new file mode 
100644 index 0000000..f020d15 --- /dev/null +++ b/bench/device/sm50_simt_max_mult_f32_srgemm_tt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + 
// benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// 
Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_mult_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, 
N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + 
+ double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 
32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_maximum_plus_ssrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_max_mult_f32_srgemm_tt_t.cu similarity index 76% rename from bench/device/simt_maximum_plus_ssrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_max_mult_f32_srgemm_tt_t.cu index af15645..4bcb53d 100644 --- a/bench/device/simt_maximum_plus_ssrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_max_mult_f32_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; 
- using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 
+254,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ 
-469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // 
Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 
16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ 
-2159,7 +2116,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2209,7 +2165,7 @@ static 
void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) 
+BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 
64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 
128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_dsrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_max_mult_f64_srgemm_nn_n.cu similarity index 69% rename from bench/device/simt_maximum_plus_dsrgemm_nn_n_sm50.cu rename to bench/device/sm50_simt_max_mult_f64_srgemm_nn_n.cu index 9afe221..900e93c 100644 --- a/bench/device/simt_maximum_plus_dsrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_max_mult_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 
8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, 
// @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 
16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void 
BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// 
Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_max_mult_f64_srgemm_nn_t.cu b/bench/device/sm50_simt_max_mult_f64_srgemm_nn_t.cu new file mode 100644 index 0000000..137ae7b --- /dev/null +++ b/bench/device/sm50_simt_max_mult_f64_srgemm_nn_t.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 1
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f64 max_mult device Srgemm configured with the ThreadblockShape and
+// WarpShape below; registered for N = 256, 512, ..., 4096.
+static void BM_SM50_device_max_mult_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  // The first benchmark argument is used for all three harness extents.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::max_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // NOTE(review): presumably run() returns before the GPU work finishes, so
+    // wait here to keep that work inside the timed iteration -- confirm.
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration, reported by Google Benchmark as a rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + =
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) +
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x
8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void
BM_SM50_device_max_mult_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass =
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape =
cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision,
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // +
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape,
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm>
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; +
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N
= static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult<precision>; +
using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_mult_f64_srgemm_nt_n.cu b/bench/device/sm50_simt_max_mult_f64_srgemm_nt_n.cu new file mode 100644 index 0000000..c0dfb3f --- /dev/null +++ b/bench/device/sm50_simt_max_mult_f64_srgemm_nt_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 
8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_max_mult_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+//
+// Benchmarks the SIMT, double-precision max_mult SRGEMM kernel built with the
+// tile shapes listed above on square N x N x N problems. The benchmark is
+// registered for N = 256 .. 4096 (multiplier 2) and publishes a "Flop/s"
+// counter. It is compiled only when CUASR_BENCH_LEVEL is defined and >= 2.
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_mult_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {
+  // Problem extent for this run, taken from the registered Range() argument.
+  // NOTE(review): the explicit <int> target type was missing (a static_cast
+  // without a target type is ill-formed); int is assumed to match the problem
+  // size type expected by the harness -- confirm against the generator.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored on the ring operator; presumably
+  // cuasr::max_mult is templated on the element type -- confirm in
+  // cuasr/functional.h.
+  using RingOp = cuasr::max_mult<precision>;
+  // Warp tile M x N (16 x 8) is elements/thread (2 x 2) times threads/warp
+  // (8 x 4); 4 x 4 such warps make up the 64 x 32 threadblock tile.
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  // The epilogue functor is taken from the default configuration.
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Layouts are column-major, row-major, column-major -- presumably for the
+  // A, B and C operands in that order (the "nt_n" in the benchmark name).
+  // NOTE(review): the trailing 2 is presumably the pipeline stage count --
+  // confirm against cuasr/gemm/device/srgemm.h.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  // NOTE(review): <Srgemm> restored; without it the Srgemm alias above would
+  // be unused and the harness would not know which kernel to run.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // Synchronize inside the timed region; presumably run() only enqueues the
+    // kernel, so this makes each iteration cover the kernel's execution.
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per run. kIsIterationInvariantRate scales
+  // the value by the iteration count and reports it as a per-second rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_mult_f64_srgemm_nt_t.cu b/bench/device/sm50_simt_max_mult_f64_srgemm_nt_t.cu new file mode 100644 index 0000000..3166b3e --- /dev/null +++ b/bench/device/sm50_simt_max_mult_f64_srgemm_nt_t.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+//
+// Benchmarks the SIMT, double-precision max_mult SRGEMM kernel built with the
+// tile shapes listed above on square N x N x N problems. The benchmark is
+// registered for N = 256 .. 4096 (multiplier 2) and publishes a "Flop/s"
+// counter. It is compiled whenever CUASR_BENCH_LEVEL is defined and >= 0.
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) {
+  // Problem extent for this run, taken from the registered Range() argument.
+  // NOTE(review): the explicit <int> target type was missing (a static_cast
+  // without a target type is ill-formed); int is assumed to match the problem
+  // size type expected by the harness -- confirm against the generator.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored on the ring operator; presumably
+  // cuasr::max_mult is templated on the element type -- confirm in
+  // cuasr/functional.h.
+  using RingOp = cuasr::max_mult<precision>;
+  // A single warp covers the whole threadblock tile here: elements/thread
+  // (8 x 4) times threads/warp (4 x 8) gives the 32 x 32 warp tile.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  // The epilogue functor is taken from the default configuration.
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Layouts are column-major, row-major, row-major -- presumably for the
+  // A, B and C operands in that order (the "nt_t" in the benchmark name).
+  // NOTE(review): the trailing 2 is presumably the pipeline stage count --
+  // confirm against cuasr/gemm/device/srgemm.h.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  // NOTE(review): <Srgemm> restored; without it the Srgemm alias above would
+  // be unused and the harness would not know which kernel to run.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // Synchronize inside the timed region; presumably run() only enqueues the
+    // kernel, so this makes each iteration cover the kernel's execution.
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per run. kIsIterationInvariantRate scales
+  // the value by the iteration count and reports it as a per-second rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + 
} + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 
x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_mult_f64_srgemm_tn_n.cu b/bench/device/sm50_simt_max_mult_f64_srgemm_tn_n.cu new file mode 100644 index 0000000..76e34b7 --- /dev/null +++ b/bench/device/sm50_simt_max_mult_f64_srgemm_tn_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 
8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_max_mult_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_maximum_minimum_dsrgemm_nt_t_sm50.cu b/bench/device/sm50_simt_max_mult_f64_srgemm_tn_t.cu similarity index 69% rename from bench/device/simt_maximum_minimum_dsrgemm_nt_t_sm50.cu rename to bench/device/sm50_simt_max_mult_f64_srgemm_tn_t.cu index 945aee0..4661909 100644 --- a/bench/device/simt_maximum_minimum_dsrgemm_nt_t_sm50.cu +++ b/bench/device/sm50_simt_max_mult_f64_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,28 +19,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,28 +68,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,28 +117,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,28 +166,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -209,7 +205,7 @@ static void 
BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,28 +215,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ 
-259,7 +254,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,28 +264,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,28 +313,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,28 +362,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, 
// precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,28 +411,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,28 +460,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,28 +509,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,28 +558,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,28 +607,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,28 +656,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm 
= cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,28 +705,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,28 +754,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,28 +803,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,28 +852,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,28 +901,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,28 +950,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,28 +999,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,28 +1048,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,28 +1097,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,28 +1146,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,28 +1195,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,28 +1244,76 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1309,7 +1332,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,28 +1342,76 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,28 +1440,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,28 +1489,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,28 +1538,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,28 +1587,125 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1559,7 +1724,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,28 +1734,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,28 +1783,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,28 +1832,27 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,28 +1881,76 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,28 +1979,76 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_minimum_dsrgemm_tn_t_sm50.cu b/bench/device/sm50_simt_max_mult_f64_srgemm_tt_n.cu similarity index 69% rename from bench/device/simt_maximum_minimum_dsrgemm_tn_t_sm50.cu rename to bench/device/sm50_simt_max_mult_f64_srgemm_tt_n.cu index 9f35d41..7e22d18 100644 --- a/bench/device/simt_maximum_minimum_dsrgemm_tn_t_sm50.cu +++ b/bench/device/sm50_simt_max_mult_f64_srgemm_tt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void 
BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; 
- using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_multiplies_dsrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_max_mult_f64_srgemm_tt_t.cu similarity index 69% rename from bench/device/simt_maximum_multiplies_dsrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_max_mult_f64_srgemm_tt_t.cu index 6c62ab9..fde1f52 100644 --- a/bench/device/simt_maximum_multiplies_dsrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_max_mult_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8 // Warps / Block: 1 x 1 
// Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_mult_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_ // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_ // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ 
BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_ // Warps / Block: 2 x 4 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_max_mult_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_ // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void 
BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_mult_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x16_32x8x1_4x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_mult_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_maximum_ssrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_max_plus_f32_srgemm_nn_n.cu similarity index 76% rename from bench/device/simt_minimum_maximum_ssrgemm_nn_n_sm50.cu 
rename to bench/device/sm50_simt_max_plus_f32_srgemm_nn_n.cu index 45c7417..452e229 100644 --- a/bench/device/simt_minimum_maximum_ssrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_max_plus_f32_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void 
BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ 
BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // 
Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { 
+static void BM_SM50_device_max_plus_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 
32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ 
static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2 // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ 
BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2 // Warps / 
Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static 
void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) 
{ +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_plus_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { 
const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; 
using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_ // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2159,7 +2116,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4 // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2209,7 +2165,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_ // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4 // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2459,7 +2410,7 @@ static void 
BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ 
BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4 // Warps / 
Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_ // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { 
+static void BM_SM50_device_max_plus_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void 
BM_SM50_device_max_plus_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_max_plus_f32_srgemm_nn_t.cu b/bench/device/sm50_simt_max_plus_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..fc9adac --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f32_srgemm_nn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_plus_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // 
+ precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 
128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 
4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_plus_f32_srgemm_nt_n.cu b/bench/device/sm50_simt_max_plus_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..e31224b --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f32_srgemm_nt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } 
+ + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: run the harness, then synchronize the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_plus_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass 
= cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_plus_f32_srgemm_nt_t.cu b/bench/device/sm50_simt_max_plus_f32_srgemm_nt_t.cu new file mode 100644 index 0000000..749fb75 --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f32_srgemm_nt_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) // compiled only when CUASR_BENCH_LEVEL is defined and >= 0
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { // benchmark case: max-plus SRGEMM on float, SIMT op class, Sm50 arch tag
+  const auto N = static_cast(state.range(0)); // size argument of this run; NOTE(review): the cast names no target type -- a `<...>` list looks lost, confirm against the real file
+  using precision = float; // one element type used for every precision slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): presumably a template whose argument list was dropped -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; // ThreadblockShape / WarpShape = 1 x 2 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>; // only EpilogueOutputOp is taken from this configuration
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, // presumably operand A -- TODO confirm
+      precision, cutlass::layout::RowMajor, // presumably operand B -- TODO confirm
+      precision, cutlass::layout::RowMajor, // presumably the output operand -- TODO confirm
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; // trailing 2: presumably the stage count -- TODO confirm
+
+  // set up the bench harness with the same extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is referenced nowhere else in this function; presumably it was the harness's template argument -- confirm
+
+  // benchmark loop: each timed iteration is one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run()); // keep the result of run() from being optimized away
+    cudaDeviceSynchronize(); // wait for the device inside the timed region; the returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); // scaled by the iteration count and reported per second
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // compiled only when CUASR_BENCH_LEVEL is defined and >= 2
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { // benchmark case: max-plus SRGEMM on float, SIMT op class, Sm50 arch tag
+  const auto N = static_cast(state.range(0)); // size argument of this run; NOTE(review): the cast names no target type -- a `<...>` list looks lost, confirm against the real file
+  using precision = float; // one element type used for every precision slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): presumably a template whose argument list was dropped -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; // ThreadblockShape / WarpShape = 2 x 1 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>; // only EpilogueOutputOp is taken from this configuration
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, // presumably operand A -- TODO confirm
+      precision, cutlass::layout::RowMajor, // presumably operand B -- TODO confirm
+      precision, cutlass::layout::RowMajor, // presumably the output operand -- TODO confirm
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; // trailing 2: presumably the stage count -- TODO confirm
+
+  // set up the bench harness with the same extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is referenced nowhere else in this function; presumably it was the harness's template argument -- confirm
+
+  // benchmark loop: each timed iteration is one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run()); // keep the result of run() from being optimized away
+    cudaDeviceSynchronize(); // wait for the device inside the timed region; the returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); // scaled by the iteration count and reported per second
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // compiled only when CUASR_BENCH_LEVEL is defined and >= 2
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { // same structure as the first benchmark in this block; only the tile shapes differ
+  const auto N = static_cast(state.range(0)); // NOTE(review): cast target type looks lost -- confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): template argument list looks lost -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; // ThreadblockShape / WarpShape = 2 x 1 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness with extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is otherwise unused; presumably the dropped template argument -- confirm
+
+  // benchmark loop: one run() plus a device-wide sync per timed iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) // compiled only when CUASR_BENCH_LEVEL is defined and >= 1
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { // same structure as the first benchmark in this block; only the tile shapes differ
+  const auto N = static_cast(state.range(0)); // NOTE(review): cast target type looks lost -- confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): template argument list looks lost -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; // ThreadblockShape / WarpShape = 2 x 1 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness with extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is otherwise unused; presumably the dropped template argument -- confirm
+
+  // benchmark loop: one run() plus a device-wide sync per timed iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) // compiled only when CUASR_BENCH_LEVEL is defined and >= 1
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { // same structure as the first benchmark in this block; only the tile shapes differ
+  const auto N = static_cast(state.range(0)); // NOTE(review): cast target type looks lost -- confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): template argument list looks lost -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; // ThreadblockShape / WarpShape = 2 x 1 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness with extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is otherwise unused; presumably the dropped template argument -- confirm
+
+  // benchmark loop: one run() plus a device-wide sync per timed iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // compiled only when CUASR_BENCH_LEVEL is defined and >= 2
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { // same structure as the first benchmark in this block; only the tile shapes differ
+  const auto N = static_cast(state.range(0)); // NOTE(review): cast target type looks lost -- confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): template argument list looks lost -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // ThreadblockShape / WarpShape = 2 x 2 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness with extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is otherwise unused; presumably the dropped template argument -- confirm
+
+  // benchmark loop: one run() plus a device-wide sync per timed iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // compiled only when CUASR_BENCH_LEVEL is defined and >= 2
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { // same structure as the first benchmark in this block; only the tile shapes differ
+  const auto N = static_cast(state.range(0)); // NOTE(review): cast target type looks lost -- confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): template argument list looks lost -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; // ThreadblockShape / WarpShape = 2 x 2 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness with extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is otherwise unused; presumably the dropped template argument -- confirm
+
+  // benchmark loop: one run() plus a device-wide sync per timed iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // compiled only when CUASR_BENCH_LEVEL is defined and >= 2
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { // same structure as the first benchmark in this block; only the tile shapes differ
+  const auto N = static_cast(state.range(0)); // NOTE(review): cast target type looks lost -- confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): template argument list looks lost -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; // ThreadblockShape / WarpShape = 2 x 2 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness with extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is otherwise unused; presumably the dropped template argument -- confirm
+
+  // benchmark loop: one run() plus a device-wide sync per timed iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // compiled only when CUASR_BENCH_LEVEL is defined and >= 2
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { // same structure as the first benchmark in this block; only the tile shapes differ
+  const auto N = static_cast(state.range(0)); // NOTE(review): cast target type looks lost -- confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): template argument list looks lost -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; // ThreadblockShape / WarpShape = 2 x 2 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness with extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is otherwise unused; presumably the dropped template argument -- confirm
+
+  // benchmark loop: one run() plus a device-wide sync per timed iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // compiled only when CUASR_BENCH_LEVEL is defined and >= 2
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { // same structure as the first benchmark in this block; only the tile shapes differ
+  const auto N = static_cast(state.range(0)); // NOTE(review): cast target type looks lost -- confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): template argument list looks lost -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; // ThreadblockShape / WarpShape = 2 x 2 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness with extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is otherwise unused; presumably the dropped template argument -- confirm
+
+  // benchmark loop: one run() plus a device-wide sync per timed iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // compiled only when CUASR_BENCH_LEVEL is defined and >= 2
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { // same structure as the first benchmark in this block; only the tile shapes differ
+  const auto N = static_cast(state.range(0)); // NOTE(review): cast target type looks lost -- confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): template argument list looks lost -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; // ThreadblockShape / WarpShape = 2 x 2 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness with extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is otherwise unused; presumably the dropped template argument -- confirm
+
+  // benchmark loop: one run() plus a device-wide sync per timed iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // compiled only when CUASR_BENCH_LEVEL is defined and >= 2
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { // benchmark case: max-plus SRGEMM on float, SIMT op class, Sm50 arch tag
+  const auto N = static_cast(state.range(0)); // size argument of this run; NOTE(review): the cast names no target type -- a `<...>` list looks lost, confirm against the real file
+  using precision = float; // one element type used for every precision slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus; // NOTE(review): presumably a template whose argument list was dropped -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; // the "Threadblock" entry of the banner above
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; // ThreadblockShape / WarpShape = 2 x 2 warps per block
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>; // only EpilogueOutputOp is taken from this configuration
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, // presumably operand A -- TODO confirm
+      precision, cutlass::layout::RowMajor, // presumably operand B -- TODO confirm
+      precision, cutlass::layout::RowMajor, // presumably the output operand -- TODO confirm
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; // trailing 2: presumably the stage count -- TODO confirm
+
+  // set up the bench harness with the same extent N in all three problem dimensions
+  cuasr::bench::device::BenchHarness bench({ N, N, N }); // NOTE(review): Srgemm is referenced nowhere else in this function; presumably it was the harness's template argument -- confirm
+
+  // benchmark loop: each timed iteration is one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run()); // keep the result of run() from being optimized away
+    cudaDeviceSynchronize(); // wait for the device inside the timed region; the returned status is ignored
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); // scaled by the iteration count and reported per second
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, ..., 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the max-plus f32 SRGEMM (operand layouts ColumnMajor / RowMajor /
+// RowMajor, in template order) with the tile configuration above on square
+// N x N x N problems, N = 256 ... 4096.
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  // edge length of the square problem, supplied by the registered range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize after every run so the timed iteration also
+  // covers outstanding device work (presumably bench.run() only launches)
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 nominal operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+ } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_maximum_plus_ssrgemm_nn_t_sm50.cu b/bench/device/sm50_simt_max_plus_f32_srgemm_tn_n.cu similarity index 
77% rename from bench/device/simt_maximum_plus_ssrgemm_nn_t_sm50.cu rename to bench/device/sm50_simt_max_plus_f32_srgemm_tn_n.cu index 63bbfac..9c46a86 100644 --- a/bench/device/simt_maximum_plus_ssrgemm_nn_t_sm50.cu +++ b/bench/device/sm50_simt_max_plus_f32_srgemm_tn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; 
- using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_plus_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1293,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1342,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = 
float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,30 +1390,29 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and 
(CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and 
(CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1489,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1587,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1626,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1636,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1685,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, 
// precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,29 +1881,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,29 +1930,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,29 +1979,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,29 +2028,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2109,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,29 +2077,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2159,7 +2116,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,30 +2125,29 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2209,7 +2165,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,29 +2175,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void 
BM_SM50_device_max_plus_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,29 +2224,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,29 +2273,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 
// Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,29 +2322,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,29 +2371,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,29 +2420,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2509,7 +2459,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,29 +2469,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,29 +2518,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2609,7 +2557,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,29 +2567,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,29 +2616,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,29 +2665,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,29 +2714,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,29 +2812,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_max_plus_f32_srgemm_tn_t.cu b/bench/device/sm50_simt_max_plus_f32_srgemm_tn_t.cu new file mode 100644 index 0000000..06563c5 --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f32_srgemm_tn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float max-plus SRGEMM (SIMT op class, Sm50) with the tile
+// shapes listed above on a square N x N x N problem, where N is the
+// benchmark's range argument. Operand layouts, in Srgemm template-argument
+// order, are row-major, column-major, row-major.
+static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  // edge length of the square problem, taken from the benchmark range
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the
+  // device; the status returned by cudaDeviceSynchronize() is not checked
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); 
+ } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_plus_f32_srgemm_tt_n.cu b/bench/device/sm50_simt_max_plus_f32_srgemm_tt_n.cu new file mode 
100644 index 0000000..43e7a63 --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f32_srgemm_tt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + 
// benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// 
Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_plus_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, 
N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + 
+ double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 
32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_minimum_maximum_ssrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_max_plus_f32_srgemm_tt_t.cu similarity index 76% rename from bench/device/simt_minimum_maximum_ssrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_max_plus_f32_srgemm_tt_t.cu index 01f8575..c3195e9 100644 --- a/bench/device/simt_minimum_maximum_ssrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_max_plus_f32_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void 
BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ 
BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,7 @@ static void 
BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2 // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ 
BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // 
Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static 
void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const 
auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_ // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2159,7 +2116,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4 // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2209,7 +2165,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_ // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4 // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::RowMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2459,7 +2410,7 @@ static void 
BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) 
+BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ 
BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 
// Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_ // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static 
void BM_SM50_device_max_plus_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void 
BM_SM50_device_max_plus_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_max_plus_f64_srgemm_nn_n.cu b/bench/device/sm50_simt_max_plus_f64_srgemm_nn_n.cu new file mode 100644 index 0000000..8e9ccc2 --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f64_srgemm_nn_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_plus_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass 
= cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 
x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_max_plus_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass 
= cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // 
+ ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 
128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + 
const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_minimum_maximum_dsrgemm_nn_t_sm50.cu b/bench/device/sm50_simt_max_plus_f64_srgemm_nn_t.cu similarity index 69% rename from bench/device/simt_minimum_maximum_dsrgemm_nn_t_sm50.cu rename to bench/device/sm50_simt_max_plus_f64_srgemm_nn_t.cu index e6d2239..58d273e 100644 --- a/bench/device/simt_minimum_maximum_dsrgemm_nn_t_sm50.cu +++ b/bench/device/sm50_simt_max_plus_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_max_plus_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
+ RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void 
BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ 
BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_max_plus_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void 
BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_max_plus_f64_srgemm_nt_n.cu b/bench/device/sm50_simt_max_plus_f64_srgemm_nt_n.cu new file mode 100644 index 0000000..f1de251 --- /dev/null +++ 
b/bench/device/sm50_simt_max_plus_f64_srgemm_nt_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 
64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 
+// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_max_plus_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using 
OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_plus_f64_srgemm_nt_t.cu b/bench/device/sm50_simt_max_plus_f64_srgemm_nt_t.cu new file mode 100644 index 0000000..a0517f6 --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f64_srgemm_nt_t.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N =
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 1. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL is defined and >= 0. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp =
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 1. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // +
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // +
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ :
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr,
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 1. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 1. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N =
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 1. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; +
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop: cudaDeviceSynchronize() blocks until the device is idle, so each timed iteration includes completion of the GPU work bench.run() presumably launches + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 -- max-plus f64 SRGEMM benchmark (layouts ColumnMajor/RowMajor/RowMajor, presumably A/B/C) over square N = 256, 512, ..., 4096; Flop/s counts 2*N^3 per iteration; built only when CUASR_BENCH_LEVEL >= 2. NOTE(review): static_cast, max_plus and BenchHarness show no template arguments here -- likely lost in extraction, confirm upstream +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor,
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + 
} + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 
x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_plus_f64_srgemm_tn_n.cu b/bench/device/sm50_simt_max_plus_f64_srgemm_tn_n.cu new file mode 100644 index 0000000..1962acb --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f64_srgemm_tn_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 
8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_max_plus_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_minimum_multiplies_dsrgemm_nt_t_sm50.cu b/bench/device/sm50_simt_max_plus_f64_srgemm_tn_t.cu similarity index 69% rename from bench/device/simt_minimum_multiplies_dsrgemm_nt_t_sm50.cu rename to bench/device/sm50_simt_max_plus_f64_srgemm_tn_t.cu index a0adc1d..c30c8ff 100644 --- a/bench/device/simt_minimum_multiplies_dsrgemm_nt_t_sm50.cu +++ b/bench/device/sm50_simt_max_plus_f64_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,28 +19,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,28 +68,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,28 +117,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,28 +166,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -209,7 +205,7 @@ static void 
BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,28 +215,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,28 +264,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,28 +313,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,28 +362,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,28 +411,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,28 +460,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,28 +509,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,28 +558,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp 
= Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,28 +607,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,28 +656,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,28 +705,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_ // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,28 +754,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_ // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,28 +803,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,28 +852,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,28 +901,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,28 +950,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,28 +999,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x8_64x16x1_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,28 +1048,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,28 +1097,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,28 +1146,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,28 +1195,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,28 +1244,76 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_max_plus_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1309,7 +1332,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x8_16x32x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,28 +1342,76 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,28 +1440,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,28 +1489,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_ // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,28 +1538,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,28 +1587,125 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; 
+ using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1559,7 +1724,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x8_32x16x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,28 +1734,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,28 +1783,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,28 +1832,27 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,28 +1881,76 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,28 +1979,76 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x16_32x8x1_4x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_max_plus_f64_srgemm_tt_n.cu b/bench/device/sm50_simt_max_plus_f64_srgemm_tt_n.cu new file mode 100644 index 0000000..fdad127 --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f64_srgemm_tt_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // range argument; passed below as all three harness extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for queued device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + 
} + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 
x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_max_plus_f64_srgemm_tt_t.cu b/bench/device/sm50_simt_max_plus_f64_srgemm_tt_t.cu new file mode 100644 index 0000000..e0ef62e --- /dev/null +++ b/bench/device/sm50_simt_max_plus_f64_srgemm_tt_t.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Shared body of the SM50 max-plus f64 SRGEMM benchmarks in this section
// (A, B and C are all row-major, i.e. the "tt_t" variants).
//
// Every benchmark below differs only in its threadblock and warp tile shapes,
// so those are the only template parameters; everything else is fixed here.
//
// Template parameters:
//   ThreadblockShape - cutlass::gemm::GemmShape<M, N, K> handed to Srgemm as
//                      the threadblock tile.
//   WarpShape        - cutlass::gemm::GemmShape<M, N, K> handed to Srgemm as
//                      the warp tile.
// Parameters:
//   state - Google Benchmark state; state.range(0) is used as the edge length
//           N of the square N x N x N problem.
//
// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
// `<Srgemm>` were absent from the text this block was reconstructed from
// (they look like they were stripped as markup). They are restored here from
// context - confirm against the generator / upstream sources.
#if defined(CUASR_BENCH_LEVEL)
template <typename ThreadblockShape, typename WarpShape>
static void bench_sm50_max_plus_f64_srgemm_tt_t(benchmark::State &state) {
  const auto N = static_cast<int>(state.range(0));
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  using RingOp           = cuasr::max_plus<precision>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this ring / op-class / arch; only its epilogue
  // type is consumed, the tile shapes come from the template parameters.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the trailing `2` is presumably the pipeline stage count, as
  // in cutlass::gemm::device::Gemm - confirm against cuasr's Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::RowMajor,                            //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    // Block until the device is idle so the timed region covers the kernel
    // itself and not just its launch.
    // NOTE(review): the result of cudaDeviceSynchronize() is discarded, so a
    // failed run would still be timed and reported.
    cudaDeviceSynchronize();
  }

  // 2 * N^3 ring operations per run; reported as a rate over all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 1
//       Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<32, 32, 8>,
                                      cutlass::gemm::GemmShape<32, 32, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 8 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<8, 32, 8>,
                                      cutlass::gemm::GemmShape<8, 16, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 8 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<8, 64, 8>,
                                      cutlass::gemm::GemmShape<8, 32, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 16 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<16, 32, 8>,
                                      cutlass::gemm::GemmShape<16, 16, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 16 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<16, 64, 8>,
                                      cutlass::gemm::GemmShape<16, 32, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 16 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<16, 128, 8>,
                                      cutlass::gemm::GemmShape<16, 64, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 1 x 2
//       Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<32, 32, 8>,
                                      cutlass::gemm::GemmShape<32, 16, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 32 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<32, 64, 8>,
                                      cutlass::gemm::GemmShape<32, 32, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 1
//       Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<32, 32, 8>,
                                      cutlass::gemm::GemmShape<16, 32, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 1
//       Threadblock: 64 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<64, 32, 8>,
                                      cutlass::gemm::GemmShape<32, 32, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 16 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<16, 32, 8>,
                                      cutlass::gemm::GemmShape<8, 16, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 16 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<16, 64, 8>,
                                      cutlass::gemm::GemmShape<8, 32, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<32, 32, 8>,
                                      cutlass::gemm::GemmShape<16, 16, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 32 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<32, 64, 8>,
                                      cutlass::gemm::GemmShape<16, 32, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 32 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<32, 128, 8>,
                                      cutlass::gemm::GemmShape<16, 64, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 2
//       Threadblock: 64 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<64, 32, 8>,
                                      cutlass::gemm::GemmShape<32, 16, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 64 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<64, 64, 8>,
                                      cutlass::gemm::GemmShape<32, 32, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 2
//       Threadblock: 128 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<128, 32, 8>,
                                      cutlass::gemm::GemmShape<64, 16, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 16 x 64 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<16, 64, 16>,
                                      cutlass::gemm::GemmShape<8, 16, 16>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 16 x 128 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<16, 128, 16>,
                                      cutlass::gemm::GemmShape<8, 32, 16>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 4
//       Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<32, 32, 8>,
                                      cutlass::gemm::GemmShape<16, 8, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 32 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
  bench_sm50_max_plus_f64_srgemm_tt_t<cutlass::gemm::GemmShape<32, 64, 8>,
                                      cutlass::gemm::GemmShape<16, 16, 8>>(state);
}
BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // NOTE(review): <int> restored (cast type was stripped) - confirm
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for this Srgemm instantiation, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, followed by cudaDeviceSynchronize()
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_max_plus_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_max_plus_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_max_f32_srgemm_nn_n.cu b/bench/device/sm50_simt_min_max_f32_srgemm_nn_n.cu new file mode 100644 index 0000000..144c928 --- /dev/null +++ b/bench/device/sm50_simt_min_max_f32_srgemm_nn_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_max_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // 
+ precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 
128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 
4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_max_f32_srgemm_nn_t.cu b/bench/device/sm50_simt_min_max_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..18d7a3b --- /dev/null +++ b/bench/device/sm50_simt_min_max_f32_srgemm_nn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void
BM_SM50_device_min_max_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass =
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape =
cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< +
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // +
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // +
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N +
cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_max_f32_srgemm_nt_n.cu b/bench/device/sm50_simt_min_max_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..cdbaf93 --- /dev/null +++ b/bench/device/sm50_simt_min_max_f32_srgemm_nt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; +
using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+ precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, //
+ precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape,
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N,
N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration +
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) +
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps /
Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void
BM_SM50_device_min_max_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness with all three problem extents equal to N + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop: one bench.run() per iteration, then wait for the device to finish + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; // 2*N^3 operations credited per iteration + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass =
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, 
N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) 
+ ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_max_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); 
+ + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] 
+ = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_max_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // size argument supplied by ->Range()
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, passing N for all three size arguments
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+//
Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ 
N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // size argument supplied by ->Range()
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, passing N for all three size arguments
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2)
+
->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // size argument supplied by ->Range()
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, passing N for all three size arguments
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each iteration calls bench.run() and then waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations counted per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+//
Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_max_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, 
// + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) 
+ ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 
+// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_max_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, 
// + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_max_f32_srgemm_nt_t.cu b/bench/device/sm50_simt_min_max_f32_srgemm_nt_t.cu new file mode 100644 index 0000000..bf60cb4 --- /dev/null +++ b/bench/device/sm50_simt_min_max_f32_srgemm_nt_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape 
= cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_minimum_plus_ssrgemm_nn_t_sm50.cu b/bench/device/sm50_simt_min_max_f32_srgemm_tn_n.cu similarity index 77% rename from 
bench/device/simt_minimum_plus_ssrgemm_nn_t_sm50.cu rename to bench/device/sm50_simt_min_max_f32_srgemm_tn_n.cu index 29367c4..fde6125 100644 --- a/bench/device/simt_minimum_plus_ssrgemm_nn_t_sm50.cu +++ b/bench/device/sm50_simt_min_max_f32_srgemm_tn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // 
Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp 
= Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1283,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1293,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1342,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,30 +1390,29 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void 
BM_SM50_device_min_max_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1440,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1489,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1538,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1587,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1636,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1675,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1685,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1734,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,29 +1783,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1859,7 +1822,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,29 +1832,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1909,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,29 +1881,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1959,7 +1920,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,29 +1930,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,29 +1979,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,29 +2028,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,29 +2077,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2159,7 +2116,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,30 +2125,29 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2209,7 +2165,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,29 +2175,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = 
float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,29 +2224,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void 
BM_SM50_device_min_max_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,29 +2273,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,29 +2322,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // 
Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,29 +2371,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2459,7 +2410,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,29 +2420,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,29 +2469,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2559,7 +2508,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,29 +2518,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2609,7 +2557,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,29 +2567,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2659,7 +2606,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,29 +2616,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,29 +2665,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2759,7 +2704,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,29 +2714,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,29 +2812,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2859,7 +2900,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_min_max_f32_srgemm_tn_t.cu b/bench/device/sm50_simt_min_max_f32_srgemm_tn_t.cu new file mode 100644 index 0000000..7b7a21d --- /dev/null +++ b/bench/device/sm50_simt_min_max_f32_srgemm_tn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape 
= cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_max_f32_srgemm_tt_n.cu b/bench/device/sm50_simt_min_max_f32_srgemm_tt_n.cu new file mode 100644 index 0000000..291c191 --- /dev/null +++ 
b/bench/device/sm50_simt_min_max_f32_srgemm_tt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+// NOTE(review): in the benchmarks below the explicit template arguments on
+// static_cast<int>, cuasr::min_max<precision> and BenchHarness<Srgemm> were
+// absent in this text (which is ill-formed C++) and have been restored from
+// context; confirm them against the benchmark generator.
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// ThreadblockShape 32x64x8, WarpShape 32x64x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// ThreadblockShape 64x32x8, WarpShape 64x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 8x32x8, WarpShape 8x16x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// ThreadblockShape 8x64x8, WarpShape 8x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 16x32x8, WarpShape 16x16x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 16x64x8, WarpShape 16x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// ThreadblockShape 16x128x8, WarpShape 16x64x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 32x32x8, WarpShape 32x16x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 32x64x8, WarpShape 32x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// ThreadblockShape 32x128x8, WarpShape 32x64x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// ThreadblockShape 64x64x8, WarpShape 64x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+// NOTE(review): in the benchmarks below the explicit template arguments on
+// static_cast<int>, cuasr::min_max<precision> and BenchHarness<Srgemm> were
+// absent in this text (which is ill-formed C++) and have been restored from
+// context; confirm them against the benchmark generator.
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 32x32x8, WarpShape 16x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 64x32x8, WarpShape 32x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// ThreadblockShape 64x64x8, WarpShape 32x64x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// ThreadblockShape 128x32x8, WarpShape 64x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 16x32x8, WarpShape 8x16x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 16x64x8, WarpShape 8x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 32x32x8, WarpShape 16x16x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 32x64x8, WarpShape 16x32x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 32x128x8, WarpShape 16x64x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// ThreadblockShape 64x32x8, WarpShape 32x16x8. Each timed iteration is one
+// bench.run() followed by cudaDeviceSynchronize(); N comes from state.range(0).
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2*N^3 operations per iteration; 2.0 keeps the product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 64x128x8
+/// threadblock tile and a 32x64x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 128x32x8
+/// threadblock tile and a 64x16x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 128x64x8
+/// threadblock tile and a 64x32x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 16x64x16
+/// threadblock tile and an 8x16x16 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 16x128x16
+/// threadblock tile and an 8x32x16 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 32x32x8
+/// threadblock tile and a 16x8x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 32x64x8
+/// threadblock tile and a 16x16x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 32x128x8
+/// threadblock tile and a 16x32x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 32x256x8
+/// threadblock tile and a 16x64x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 64x64x8
+/// threadblock tile and a 32x16x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 64x128x8
+/// threadblock tile and a 32x32x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+/// Benchmarks the SM50 SIMT min_max f32 SRGEMM configured with a 64x256x8
+/// threadblock tile and a 32x64x8 warp tile on an N x N x N problem
+/// (N = state.range(0)); throughput is published as the "Flop/s" counter.
+static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) {
+  // problem size for this run (state.range() is 64-bit, narrowed to int)
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type assembled above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for outstanding
+  // device work so it is included in the measured iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per run; the counter flag scales this by the
+  // iteration count and divides by elapsed time
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // 
+ precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + 
} + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git 
a/bench/device/sm50_simt_min_max_f32_srgemm_tt_t.cu b/bench/device/sm50_simt_min_max_f32_srgemm_tt_t.cu
new file mode 100644
index 0000000..ed17260
--- /dev/null
+++ b/bench/device/sm50_simt_min_max_f32_srgemm_tt_t.cu
@@ -0,0 +1,2906 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+//////////////////////////////////////////////////////////////////////
+// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+//////////////////////////////////////////////////////////////////////
+
+#include "benchmark/benchmark.h"
+
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+#include "harness.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 1
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length supplied by ->Range()
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for a square N x N x N problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one harness run per iteration, then wait for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // two ring ops per (m, n, k) triple
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, 
SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr 
= 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr 
= 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// NOTE(review): throughout this group the template argument lists in
// `static_cast<int>`, `cuasr::min_max<precision>` and `BenchHarness<Srgemm>` were
// absent from the text (it does not parse without them) and are restored by
// inference -- confirm against the cuASR headers.

/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 32x32x8, warp tile 8x16x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 64 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 64x32x8, warp tile 16x16x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 64 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 64x64x8, warp tile 16x32x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 128 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 128x32x8, warp tile 32x16x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 128 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 128x64x8, warp tile 32x32x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 128 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 128x128x8, warp tile 32x64x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 256 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 256x32x8, warp tile 64x16x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 256 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 256x64x8, warp tile 64x32x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 32 x 64 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 32x64x16, warp tile 8x16x16.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 32 x 128 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 32x128x16, warp tile 8x32x16.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

// NOTE(review): throughout this group the template argument lists in
// `static_cast<int>`, `cuasr::min_max<precision>` and `BenchHarness<Srgemm>` were
// absent from the text (it does not parse without them) and are restored by
// inference -- confirm against the cuASR headers. The final 128x256x8 case
// continues past this span and is left exactly as found.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 4
// Threadblock: 64 x 32 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 64x32x16, warp tile 16x8x16.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 64 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 64x64x8, warp tile 16x16x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 64 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 64x128x8, warp tile 16x32x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 64 x 256 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 64x256x8, warp tile 16x64x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 4
// Threadblock: 128 x 32 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 128x32x16, warp tile 32x8x16.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 8, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 4
// Threadblock: 128 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 128x64x8, warp tile 32x16x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 128 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
/// min_max f32 SRGEMM benchmark (SIMT op class, Sm50 tag, RowMajor layouts):
/// threadblock tile 128x128x8, warp tile 32x32x8.
static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) {
  // Size argument comes from the Range() sweep registered below.
  const auto N = static_cast<int>(state.range(0));

  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;
  using RingOp    = cuasr::min_max<precision>;

  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Harness instantiated for this Srgemm configuration, constructed from { N, N, N }.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();  // wait for the device inside the timed region
  }

  // Counter value is 2*N^3 per iteration, reported as a rate.
  const double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_maximum_multiplies_dsrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_min_max_f64_srgemm_nn_n.cu similarity index 69% rename from bench/device/simt_maximum_multiplies_dsrgemm_nn_n_sm50.cu rename to bench/device/sm50_simt_min_max_f64_srgemm_nn_n.cu index 9591333..b8f3d41 100644 --- a/bench/device/simt_maximum_multiplies_dsrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_min_max_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void 
BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ 
BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8 // Warps / Block: 2 
x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static 
void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_ // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { 
+static void BM_SM50_device_min_max_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_ // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) 
{ const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x8_64x16x1_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; 
- using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x8_16x32x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_ // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ 
BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8 // Warps / 
Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x8_32x16x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_min_max_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x16_32x8x1_4x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_dsrgemm_tn_n_sm50.cu b/bench/device/sm50_simt_min_max_f64_srgemm_nn_t.cu similarity index 69% rename from bench/device/simt_maximum_plus_dsrgemm_tn_n_sm50.cu rename to bench/device/sm50_simt_min_max_f64_srgemm_nn_t.cu index 2cecd50..3b102ec 100644 --- a/bench/device/simt_maximum_plus_dsrgemm_tn_n_sm50.cu +++ b/bench/device/sm50_simt_min_max_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, 
OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // 
Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void 
BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_binary_or_binary_and_dsrgemm_nt_n_sm50.cu b/bench/device/sm50_simt_min_max_f64_srgemm_nt_n.cu similarity index 69% rename from bench/device/simt_binary_or_binary_and_dsrgemm_nt_n_sm50.cu rename to bench/device/sm50_simt_min_max_f64_srgemm_nt_n.cu index 22f4653..6997fa1 100644 --- a/bench/device/simt_binary_or_binary_and_dsrgemm_nt_n_sm50.cu +++ b/bench/device/sm50_simt_min_max_f64_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_16x64x1_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4 // Warps / 
Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State 
&state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_min_max_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { 
const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x128x8_16x64x1_4x8_ // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; 
- using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x8_16x64x1_4x8_ // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x8_64x16x1_8x 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x8_64x16x1_8x4_ // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_16x128x16_8x32x1_2x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x8x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x8_16x32x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x8_16x32x1_4x4_ // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and 
(CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 
8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x8_32x16x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x8_32x16x1_4x4_ // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_32x128x16_8x32x1_2x4_ // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x16_16x8x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_16x16x1_4x2 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark 
loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x16_32x8x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_binary_or_binary_and_dsrgemm_nt_t_sm50.cu b/bench/device/sm50_simt_min_max_f64_srgemm_nt_t.cu similarity index 69% rename from bench/device/simt_binary_or_binary_and_dsrgemm_nt_t_sm50.cu rename to bench/device/sm50_simt_min_max_f64_srgemm_nt_t.cu index b4a6ded..fb7704f 100644 --- a/bench/device/simt_binary_or_binary_and_dsrgemm_nt_t_sm50.cu +++ b/bench/device/sm50_simt_min_max_f64_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_16x64x1_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_16x16x1_4x2 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x128x8_16x64x1_4x8_ // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4 // Warps / Block: 2 
x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static 
void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) 
{ +static void BM_SM50_device_min_max_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x8_16x64x1_4x8_ // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x8_64x16x1_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x8_64x16x1_8x4_ // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_16x128x16_8x32x1_2x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; 
- using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x8x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x8_16x32x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x8_16x32x1_4x4_ // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N 
}); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4 // Warps / 
Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x8_32x16x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_min_max_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x8_32x16x1_4x4_ // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_32x128x16_8x32x1_2x4_ // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x16_16x8x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x16_32x8x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_min_max_f64_srgemm_tn_n.cu b/bench/device/sm50_simt_min_max_f64_srgemm_tn_n.cu new file mode 100644 index 0000000..d784ebc --- /dev/null +++ b/bench/device/sm50_simt_min_max_f64_srgemm_tn_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+//
+// The tiling above multiplies out to the shapes used below:
+//   (4 x 4) elements * (4 x 8) threads = 16 x 32  warp tile  (WarpShape)
+//   (16 x 32) warp   * (2 x 4) warps   = 32 x 128 block tile (ThreadblockShape)
+//
+// Only compiled when CUASR_BENCH_LEVEL is defined and is at least 2.
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark body: times bench.run() for a cubic N x N x N problem using
+// double precision and the cuasr::min_max ring operator, then publishes a
+// "Flop/s" counter.
+//
+// state: benchmark state; state.range(0) supplies the problem size N.
+static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  // NOTE(review): static_cast has no target type as written here, which is not
+  // valid C++. The template argument was presumably lost when this patch text
+  // was extracted -- confirm against the upstream file.
+  const auto N = static_cast(state.range(0));
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::min_max is used as a type without template arguments;
+  // possibly also stripped during extraction -- TODO confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  // Default configuration for this precision / ring operator / op class /
+  // architecture combination; only its EpilogueOutputOp member is used below.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // One row-major and two column-major layouts, in that order; presumably the
+  // A, B and C operands, matching the "tn_n" tag in the function name -- confirm
+  // against the cuasr::gemm::device::Srgemm declaration. The trailing 2 is
+  // presumably the pipeline stage count -- TODO confirm.
+  // (The empty trailing "//" markers only pin the line breaks for the formatter.)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  // NOTE(review): Srgemm is declared above but never referenced, so the kernel
+  // type does not visibly reach the harness. BenchHarness presumably took it as
+  // a template argument that was lost in extraction -- confirm upstream.
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop
+  // Each timed iteration calls bench.run() and then cudaDeviceSynchronize()
+  // inside the measured region. The status returned by cudaDeviceSynchronize()
+  // is discarded, so a failing run is not reported here.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are credited per iteration (presumably one ring "add"
+  // and one ring "multiply" per inner-product term). kIsIterationInvariantRate
+  // makes the library scale the value by the iteration count and divide by the
+  // elapsed time, giving a per-second rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+// Registered for N = 256, 512, 1024, 2048 and 4096 (range 256..4096, step x2).
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 
+// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_max_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_minimum_maximum_dsrgemm_nt_t_sm50.cu b/bench/device/sm50_simt_min_max_f64_srgemm_tn_t.cu similarity index 69% rename from bench/device/simt_minimum_maximum_dsrgemm_nt_t_sm50.cu rename to bench/device/sm50_simt_min_max_f64_srgemm_tn_t.cu index c537d86..2a4a5f4 100644 --- a/bench/device/simt_minimum_maximum_dsrgemm_nt_t_sm50.cu +++ b/bench/device/sm50_simt_min_max_f64_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,28 +19,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,28 +68,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,28 +117,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,28 +166,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -209,7 +205,7 @@ static void 
BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,28 +215,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ 
-259,7 +254,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,28 +264,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,28 +313,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,28 +362,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, 
OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,28 +411,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,28 +460,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,28 +509,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,28 +558,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,28 +607,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,28 +656,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,28 +705,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,28 +754,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,28 +803,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,28 +852,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,28 +901,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,28 +950,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,28 +999,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,28 +1048,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,28 +1097,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,28 +1146,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,28 +1195,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 
16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,28 +1244,76 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using 
WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; 
+ using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1309,7 +1332,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,28 +1342,76 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,28 +1440,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,28 +1489,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,28 +1538,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,28 +1587,125 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1559,7 +1724,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,28 +1734,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,28 +1783,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,28 +1832,27 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,28 +1881,76 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, 
// + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,28 +1979,76 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, 
// precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ 
-1809,7 +2067,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_maximum_dsrgemm_tn_t_sm50.cu b/bench/device/sm50_simt_min_max_f64_srgemm_tt_n.cu similarity index 69% rename from bench/device/simt_minimum_maximum_dsrgemm_tn_t_sm50.cu rename to bench/device/sm50_simt_min_max_f64_srgemm_tt_n.cu index 136e5b0..77ff56e 100644 --- a/bench/device/simt_minimum_maximum_dsrgemm_tn_t_sm50.cu +++ b/bench/device/sm50_simt_min_max_f64_srgemm_tt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, 
// precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, 
// precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp 
= cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_min_max_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ 
BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void 
BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm 
= cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_maximum_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_binary_or_binary_and_dsrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_min_max_f64_srgemm_tt_t.cu similarity index 69% rename from bench/device/simt_binary_or_binary_and_dsrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_min_max_f64_srgemm_tt_t.cu index 4078e77..cf0955f 100644 --- a/bench/device/simt_binary_or_binary_and_dsrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_min_max_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_16x64x1_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4 // Warps / Block: 1 x 
1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_min_max_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x128x8_16x64x1_4x8_ // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, 
// precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x8_16x64x1_4x8_ // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_32x16x1_4x4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x8_64x16x1_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x8_64x16x1_8x4_ // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_16x128x16_8x32x1_2x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x8x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x // Warps / Block: 2 x 4 
// Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static 
void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x8_16x32x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_min_max_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x8_16x32x1_4x4_ // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_max_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x8_32x16x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x8_32x16x1_4x4_ // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_32x128x16_8x32x1_2x4_ // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x16_16x8x1_2x2 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_max_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_max_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_max_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x16_32x8x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_max_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_ssrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_min_mult_f32_srgemm_nn_n.cu similarity index 77% rename from bench/device/simt_maximum_plus_ssrgemm_nn_n_sm50.cu rename to 
bench/device/sm50_simt_min_mult_f32_srgemm_nn_n.cu index 38f0663..824608a 100644 --- a/bench/device/simt_maximum_plus_ssrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_min_mult_f32_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, 
// precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / 
Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_min_mult_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // 
Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL 
>= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_min_mult_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 
128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2159,7 +2116,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2209,7 +2165,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static 
void BM_SM50_device_min_mult_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { 
const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_min_mult_f32_srgemm_nn_t.cu b/bench/device/sm50_simt_min_mult_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..e2d35ef --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f32_srgemm_nn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_mult_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // 
+ precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // only built when the bench level is 2 or higher
+static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { // times one Sm50 SIMT float min_mult Srgemm tile configuration
+  const auto N = static_cast<int>(state.range(0)); // benchmark range argument; used for all three extents passed to the harness
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type configured above, sized { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one bench.run() per iteration, then wait for the device so the timing covers GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double (2.0 promotes the product), so int N does not overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 
128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 
4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_mult_f32_srgemm_nt_n.cu b/bench/device/sm50_simt_min_mult_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..3af198c --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f32_srgemm_nt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } 
+ + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_mult_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 32x32x8 threadblock tile and an 8x16x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 64x32x8 threadblock tile and a 16x16x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 64x64x8 threadblock tile and a 16x32x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 128x32x8 threadblock tile and a 32x16x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 128x64x8 threadblock tile and a 32x32x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 128x128x8 threadblock tile and a 32x64x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 256x32x8 threadblock tile and a 64x16x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 256x64x8 threadblock tile and a 64x32x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 32x64x16 threadblock tile and an 8x16x16 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 32x128x16 threadblock tile and an 8x32x16 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 64x32x16 threadblock tile and a 16x8x16 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 64x64x8 threadblock tile and a 16x16x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark case: fp32 cuasr::min_mult SRGEMM (SIMT op class, Sm50) with a
+// 64x128x8 threadblock tile and a 16x32x8 warp tile.
+// state.range(0) supplies N for an N x N x N problem; the registration below
+// sweeps N over 256..4096, doubling each step. Reports a "Flop/s" rate counter
+// derived from 2*N^3 operations per iteration.
+// NOTE(review): the explicit template arguments `<int>`, `<precision>` and
+// `<Srgemm>` are inferred from how the values are used here - confirm against
+// the cuASR headers.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // layouts: ColumnMajor, RowMajor, ColumnMajor (presumably A, B, C, matching
+  // the `nt_n` tag - confirm against the Srgemm parameter order)
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // block until the device is idle so the timed iteration includes whatever
+    // run() launched
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_mult_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass 
= cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_mult_f32_srgemm_nt_t.cu b/bench/device/sm50_simt_min_mult_f32_srgemm_nt_t.cu new file mode 100644 index 0000000..f05f569 --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f32_srgemm_nt_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Google Benchmark case for a float cuasr::min_mult SRGEMM assembled from
+// SIMT (cutlass::arch::OpClassSimt) components targeting cutlass::arch::Sm50.
+//   Threadblock tile 32x128x8, warp tile 32x64x8, instruction shape 1x1x1.
+//   Operand layouts are ColumnMajor, RowMajor, RowMajor (presumably A, B, C).
+// state.range(0) supplies N and the harness is constructed with { N, N, N }
+// (presumably the M, N, K problem size -- confirm against BenchHarness).
+// Every iteration calls bench.run() followed by cudaDeviceSynchronize(), so
+// device work issued by run() has completed before the next iteration.
+// Reports "Flop/s" as an iteration-invariant rate of 2 * N^3 per iteration.
+// NOTE(review): template arguments are spelled out on static_cast,
+// cuasr::min_mult and BenchHarness; confirm min_mult<precision> and
+// BenchHarness<Srgemm> against their declarations.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2.0 forces the product into double, so N * N * N cannot overflow int.
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Google Benchmark case for a float cuasr::min_mult SRGEMM assembled from
+// SIMT (cutlass::arch::OpClassSimt) components targeting cutlass::arch::Sm50.
+//   Threadblock tile 64x64x8, warp tile 64x32x8, instruction shape 1x1x1.
+//   Operand layouts are ColumnMajor, RowMajor, RowMajor (presumably A, B, C).
+// state.range(0) supplies N and the harness is constructed with { N, N, N }
+// (presumably the M, N, K problem size -- confirm against BenchHarness).
+// Every iteration calls bench.run() followed by cudaDeviceSynchronize(), so
+// device work issued by run() has completed before the next iteration.
+// Reports "Flop/s" as an iteration-invariant rate of 2 * N^3 per iteration.
+// NOTE(review): template arguments are spelled out on static_cast,
+// cuasr::min_mult and BenchHarness; confirm min_mult<precision> and
+// BenchHarness<Srgemm> against their declarations.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2.0 forces the product into double, so N * N * N cannot overflow int.
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 32x32x8, warp 16x32x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+// NOTE(review): in this and the following cases the template arguments are
+// spelled out on static_cast, cuasr::min_mult and BenchHarness; confirm
+// min_mult<precision> and BenchHarness<Srgemm> against their declarations.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 64x32x8, warp 32x32x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 64x64x8, warp 32x64x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 128x32x8, warp 64x32x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 16x32x8, warp 8x16x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 16x64x8, warp 8x32x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 32x32x8, warp 16x16x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 32x64x8, warp 16x32x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 32x128x8, warp 16x64x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// min_mult f32 SRGEMM benchmark (SIMT, Sm50): threadblock 64x32x8, warp 32x16x8.
+// N comes from state.range(0); reports "Flop/s" from 2 * N^3 per iteration.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: synchronize so each iteration waits for the device
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case for a float cuasr::min_mult SRGEMM assembled from
+// SIMT (cutlass::arch::OpClassSimt) components targeting cutlass::arch::Sm50.
+//   Threadblock tile 64x64x8, warp tile 32x32x8, instruction shape 1x1x1.
+//   Operand layouts are ColumnMajor, RowMajor, RowMajor (presumably A, B, C).
+// state.range(0) supplies N and the harness is constructed with { N, N, N }
+// (presumably the M, N, K problem size -- confirm against BenchHarness).
+// Every iteration calls bench.run() followed by cudaDeviceSynchronize(), so
+// device work issued by run() has completed before the next iteration.
+// Reports "Flop/s" as an iteration-invariant rate of 2 * N^3 per iteration.
+// NOTE(review): template arguments are spelled out on static_cast,
+// cuasr::min_mult and BenchHarness; confirm min_mult<precision> and
+// BenchHarness<Srgemm> against their declarations.
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2.0 forces the product into double, so N * N * N cannot overflow int.
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: sync after each run() so device work completes within the iteration
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no integer overflow of N^3
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_mult_f32_srgemm_tn_n.cu b/bench/device/sm50_simt_min_mult_f32_srgemm_tn_n.cu new file mode 
100644 index 0000000..d410a1f --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f32_srgemm_tn_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + 
+ // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + 
= benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// 
Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_mult_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ 
N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float min_mult SRGEMM (OpClassSimt, Sm50; operand layouts
+// RowMajor, ColumnMajor, ColumnMajor) built with a 32x32x8 threadblock shape and
+// a 16x32x8 warp shape. The problem is square: state.range(0) supplies N and the
+// harness is constructed with { N, N, N }.
+//
+// NOTE(review): the explicit template argument lists on static_cast, on
+// cuasr::min_mult and on BenchHarness were absent from the reviewed text and
+// have been restored as <int>, <precision> and <Srgemm>. Confirm them against
+// the cuASR headers before merging.
+static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated on Srgemm, otherwise the
+  // kernel configuration assembled above is never used
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: every timed iteration runs the harness once and then waits
+  // on cudaDeviceSynchronize() before the next iteration starts
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are attributed to each iteration and reported through
+  // an iteration-invariant rate counter
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float min_mult SRGEMM (OpClassSimt, Sm50; operand layouts
+// RowMajor, ColumnMajor, ColumnMajor) built with a 64x32x8 threadblock shape and
+// a 32x32x8 warp shape. The problem is square: state.range(0) supplies N and the
+// harness is constructed with { N, N, N }.
+//
+// NOTE(review): the explicit template argument lists on static_cast, on
+// cuasr::min_mult and on BenchHarness were absent from the reviewed text and
+// have been restored as <int>, <precision> and <Srgemm>. Confirm them against
+// the cuASR headers before merging.
+static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated on Srgemm, otherwise the
+  // kernel configuration assembled above is never used
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: every timed iteration runs the harness once and then waits
+  // on cudaDeviceSynchronize() before the next iteration starts
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are attributed to each iteration and reported through
+  // an iteration-invariant rate counter
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 
64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + 
using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 
x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_mult_f32_srgemm_tn_t.cu b/bench/device/sm50_simt_min_mult_f32_srgemm_tn_t.cu new file mode 100644 index 0000000..badc3b5 --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f32_srgemm_tn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + 
using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision,
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // +
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ :
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr,
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N =
static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using
ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; +
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor,
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench
harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + +
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast<int>(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N =
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_mult_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, 
// + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) 
+ ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 
+// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_mult_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, 
// + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_mult_f32_srgemm_tt_n.cu b/bench/device/sm50_simt_min_mult_f32_srgemm_tt_n.cu new file mode 100644 index 0000000..0411f0a --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f32_srgemm_tt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 32x32x8 threadblock tile and a 16x32x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 64x32x8 threadblock tile and a 32x32x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 64x64x8 threadblock tile and a 32x64x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 128x32x8 threadblock tile and a 64x32x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 16x32x8 threadblock tile and a 8x16x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 16x64x8 threadblock tile and a 8x32x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 32x32x8 threadblock tile and a 16x16x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 32x64x8 threadblock tile and a 16x32x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 32x128x8 threadblock tile and a 16x64x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 64x32x8 threadblock tile and a 32x16x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 64x64x8 threadblock tile and a 32x32x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 64x128x8 threadblock tile and a 32x64x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 128x32x8 threadblock tile and a 64x16x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 128x64x8 threadblock tile and a 64x32x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 16x64x16 threadblock tile and a 8x16x16 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 16x128x16 threadblock tile and a 8x32x16 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 32x32x8 threadblock tile and a 16x8x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 32x64x8 threadblock tile and a 16x16x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 32x128x8 threadblock tile and a 16x32x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 32x256x8 threadblock tile and a 16x64x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Device-level f32 min_mult SRGEMM benchmark (OpClassSimt / Sm50 tags) using a
+// 64x64x8 threadblock tile and a 32x16x8 warp tile. state.range(0) supplies N; the
+// harness is built for an N x N x N problem and "Flop/s" is derived from 2*N^3.
+// NOTE(review): the <int>, <precision> and <Srgemm> template arguments below are
+// presumed -- confirm them against the upstream benchmark sources.
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  // Problem size for this run, taken from the benchmark argument.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Harness for the Srgemm type configured above.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: run once per iteration, then wait for outstanding device work.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a rate over all iterations.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the min_mult f32 SRGEMM configured with a 64x128x8 threadblock tile
+// and a 32x32x8 warp tile. The harness is built with the size triple {N, N, N},
+// where N is the benchmark range argument (256 to 4096, doubling each step).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the min_mult f32 SRGEMM configured with a 64x256x8 threadblock tile
+// and a 32x64x8 warp tile. The harness is built with the size triple {N, N, N},
+// where N is the benchmark range argument (256 to 4096, doubling each step).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Benchmarks the min_mult f32 SRGEMM configured with a 128x128x8 threadblock
+// tile and a 64x32x8 warp tile. The harness is built with the size triple
+// {N, N, N}, where N is the benchmark range argument (256 to 4096, doubling).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the min_mult f32 SRGEMM configured with a 32x32x8 threadblock tile
+// and an 8x16x8 warp tile. The harness is built with the size triple {N, N, N},
+// where N is the benchmark range argument (256 to 4096, doubling each step).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the min_mult f32 SRGEMM configured with a 64x32x8 threadblock tile
+// and a 16x16x8 warp tile. The harness is built with the size triple {N, N, N},
+// where N is the benchmark range argument (256 to 4096, doubling each step).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the min_mult f32 SRGEMM configured with a 64x64x8 threadblock tile
+// and a 16x32x8 warp tile. The harness is built with the size triple {N, N, N},
+// where N is the benchmark range argument (256 to 4096, doubling each step).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the min_mult f32 SRGEMM configured with a 128x32x8 threadblock tile
+// and a 32x16x8 warp tile. The harness is built with the size triple {N, N, N},
+// where N is the benchmark range argument (256 to 4096, doubling each step).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the min_mult f32 SRGEMM configured with a 128x64x8 threadblock tile
+// and a 32x32x8 warp tile. The harness is built with the size triple {N, N, N},
+// where N is the benchmark range argument (256 to 4096, doubling each step).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the min_mult f32 SRGEMM configured with a 128x128x8 threadblock
+// tile and a 32x64x8 warp tile. The harness is built with the size triple
+// {N, N, N}, where N is the benchmark range argument (256 to 4096, doubling).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the min_mult f32 SRGEMM configured with a 256x32x8 threadblock tile
+// and a 64x16x8 warp tile. The harness is built with the size triple {N, N, N},
+// where N is the benchmark range argument (256 to 4096, doubling each step).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the min_mult f32 SRGEMM configured with a 256x64x8 threadblock tile
+// and a 64x32x8 warp tile. The harness is built with the size triple {N, N, N},
+// where N is the benchmark range argument (256 to 4096, doubling each step).
+// Layouts are RowMajor, RowMajor, ColumnMajor (presumably A, B, C -- "tt_n").
+static void BM_SM50_device_min_mult_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) {
+  // state.range(0) is narrowed to int for the {N, N, N} size triple below.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::min_mult is a class template, its argument list
+  // (likely <precision>) is missing here -- confirm against the upstream file.
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness; it must be instantiated with the kernel type
+  // configured above, otherwise the Srgemm alias would go unused
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    // wait for the device before the next iteration (assumes run() may return
+    // before the GPU work has finished -- confirm against BenchHarness)
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the 2.0 literal keeps the
+  // product in double
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_maximum_minimum_ssrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_min_mult_f32_srgemm_tt_t.cu similarity 
index 76% rename from bench/device/simt_maximum_minimum_ssrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_min_mult_f32_srgemm_tt_t.cu index ba34829..7339ded 100644 --- a/bench/device/simt_maximum_minimum_ssrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_min_mult_f32_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void 
BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif 
@@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 
// Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_min_mult_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 
8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ 
-1059,7 +1038,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_ 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 2 
// Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2 // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and 
(CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static 
void BM_SM50_device_min_mult_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const 
auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_ // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2159,7 +2116,7 @@ static void 
BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4 // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2209,7 +2165,7 @@ static void 
BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_ // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4 // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) 
+BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { 
+static void BM_SM50_device_min_mult_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_ // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_min_mult_f64_srgemm_nn_n.cu b/bench/device/sm50_simt_min_mult_f64_srgemm_nn_n.cu new file mode 100644 index 0000000..9fdc791 --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f64_srgemm_nn_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_mult_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass 
= cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 
x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_min_mult_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass 
= cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // 
+ ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 
128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + 
const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_mult_f64_srgemm_nn_t.cu b/bench/device/sm50_simt_min_mult_f64_srgemm_nn_t.cu new file mode 100644 index 0000000..455e686 --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f64_srgemm_nn_t.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Shared driver for the SM50 SIMT, double precision, min-mult SRGEMM benchmarks
// below. Every benchmark in this group runs the same kernel configuration and
// differs only in the threadblock and warp tile shapes, which are supplied as
// template arguments.
//
// Template parameters:
//   ThreadblockShape - cutlass::gemm::GemmShape of the tile computed per threadblock
//   WarpShape        - cutlass::gemm::GemmShape of the tile computed per warp
//
// Parameters:
//   state - Google Benchmark state; state.range(0) is the problem size N and
//           the benchmarked problem is N x N x N.
//
// Reports a "Flop/s" counter that charges 2 * N^3 operations per iteration.
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
template <typename ThreadblockShape, typename WarpShape>
static void run_sm50_min_mult_f64_srgemm_nn_t_bench(benchmark::State &state) {
  const auto N = static_cast<int>(state.range(0));

  using precision        = double;
  using OpClass          = cutlass::arch::OpClassSimt;
  using SmArch           = cutlass::arch::Sm50;
  using RingOp           = cuasr::min_mult<precision>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = typename Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major for the three matrix
  // operands (the "nn_t" in the benchmark names). The trailing 2 is presumably
  // the pipeline stage count, as in cutlass::gemm::device::Gemm - TODO confirm.
  using Srgemm = cuasr::gemm::device::Srgemm<                            //
      RingOp,                                                            //
      precision, cutlass::layout::ColumnMajor,                           //
      precision, cutlass::layout::ColumnMajor,                           //
      precision, cutlass::layout::RowMajor,                              //
      precision, OpClass, SmArch,                                        //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,   //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    // Wait for the device so the timed iteration covers finished work, and
    // surface failures instead of reporting a throughput for a broken run.
    const cudaError_t sync_status = cudaDeviceSynchronize();
    if (sync_status != cudaSuccess) {
      state.SkipWithError(cudaGetErrorString(sync_status));
      break;  // required after SkipWithError inside the range-for loop
    }
  }

  // 2.0 * N first promotes to double, so the product cannot overflow int.
  const double flops_per_itr = 2.0 * N * N * N;
  // kIsIterationInvariantRate: the value is scaled by the iteration count and
  // reported per second.
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 16 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 64 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 64 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 128 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 16 x 64 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 16 x 128 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 4
// Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 32 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 32 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 32 x 256 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 4
// Threadblock: 64 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 64 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 64 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 64 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 128 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 128 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 256 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 32 x 64 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 32 x 128 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) {
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
  run_sm50_min_mult_f64_srgemm_nn_t_bench<ThreadblockShape, WarpShape>(state);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 4
// Threadblock: 64 x 32 x 16
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// Benchmarks the min_mult FP64 SIMT SRGEMM instantiated with the tile shapes
// declared below on a cubic problem: the single benchmark argument is used for
// all three extents passed to the harness ({ N, N, N }).
// Only compiled when CUASR_BENCH_LEVEL is defined and >= 2.
static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {
  // state.range(0) is the value supplied by the Range() registration below.
  // NOTE(review): the cast's target type was missing in the extracted source;
  // `int` is assumed so that { N, N, N } does not narrow -- confirm.
  const auto N = static_cast<int>(state.range(0));
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): element-type argument restored on the assumption that the
  // ring operator is templated on the element type -- confirm.
  using RingOp           = cuasr::min_mult<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The default configuration is consulted only for its epilogue functor.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Three (element, layout) pairs follow the ring op -- presumably the A, B
  // and C operands, in that order (column, column, row for this "nn_t" case).
  // The trailing `2` is presumably the pipeline stage count -- TODO confirm.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  // NOTE(review): `Srgemm` was otherwise unused, so it is restored here as the
  // harness's template argument -- confirm against harness.h.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    // Wait for the device before the iteration ends so the timed region
    // covers the GPU work and not just the launch.
    cudaDeviceSynchronize();
  }

  // Two operations per inner-product term of an N x N x N product.
  // The leading 2.0 keeps the whole product in double precision.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 64 x 64
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_mult_f64_srgemm_nt_n.cu b/bench/device/sm50_simt_min_mult_f64_srgemm_nt_n.cu new file mode 100644 index 0000000..33acc88 --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f64_srgemm_nt_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 1
// Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
// Benchmarks the min_mult FP64 SIMT SRGEMM instantiated with the tile shapes
// declared below on a cubic problem: the single benchmark argument is used for
// all three extents passed to the harness ({ N, N, N }).
// Compiled whenever CUASR_BENCH_LEVEL is defined and >= 0.
static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) {
  // state.range(0) is the value supplied by the Range() registration below.
  // NOTE(review): the cast's target type was missing in the extracted source;
  // `int` is assumed so that { N, N, N } does not narrow -- confirm.
  const auto N = static_cast<int>(state.range(0));
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): element-type argument restored on the assumption that the
  // ring operator is templated on the element type -- confirm.
  using RingOp           = cuasr::min_mult<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The default configuration is consulted only for its epilogue functor.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Three (element, layout) pairs follow the ring op -- presumably the A, B
  // and C operands, in that order (column, row, column for this "nt_n" case).
  // The trailing `2` is presumably the pipeline stage count -- TODO confirm.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  // NOTE(review): `Srgemm` was otherwise unused, so it is restored here as the
  // harness's template argument -- confirm against harness.h.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    // Wait for the device before the iteration ends so the timed region
    // covers the GPU work and not just the launch.
    cudaDeviceSynchronize();
  }

  // Two operations per inner-product term of an N x N x N product.
  // The leading 2.0 keeps the whole product in double precision.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 2
// Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 
8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_min_mult_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_maximum_minimum_dsrgemm_tt_n_sm50.cu b/bench/device/sm50_simt_min_mult_f64_srgemm_nt_t.cu similarity index 69% rename from bench/device/simt_maximum_minimum_dsrgemm_tt_n_sm50.cu rename to bench/device/sm50_simt_min_mult_f64_srgemm_nt_t.cu index ca868f6..7ed7cc1 100644 --- a/bench/device/simt_maximum_minimum_dsrgemm_tt_n_sm50.cu +++ b/bench/device/sm50_simt_min_mult_f64_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_min_mult_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void 
BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; 
- using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_min_mult_f64_srgemm_tn_n.cu b/bench/device/sm50_simt_min_mult_f64_srgemm_tn_n.cu new file mode 100644 index 0000000..a0109ba --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f64_srgemm_tn_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 
8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_min_mult_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_mult_f64_srgemm_tn_t.cu b/bench/device/sm50_simt_min_mult_f64_srgemm_tn_t.cu new file mode 100644 index 0000000..ce1453c --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f64_srgemm_tn_t.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + 
} + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 
x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_mult_f64_srgemm_tt_n.cu b/bench/device/sm50_simt_min_mult_f64_srgemm_tt_n.cu new file mode 100644 index 0000000..84a0956 --- /dev/null +++ b/bench/device/sm50_simt_min_mult_f64_srgemm_tt_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + 
} + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 
x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_plus_multiplies_dsrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_min_mult_f64_srgemm_tt_t.cu similarity index 69% rename from bench/device/simt_plus_multiplies_dsrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_min_mult_f64_srgemm_tt_t.cu index 2cb9c1a..cb32374 100644 --- a/bench/device/simt_plus_multiplies_dsrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_min_mult_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_min_mult_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 
128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // 
- precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void 
BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 
4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static 
void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + 
using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 
+// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_mult_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_mult_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_min_plus_f32_srgemm_nn_n.cu b/bench/device/sm50_simt_min_plus_f32_srgemm_nn_n.cu new file mode 100644 index 0000000..38cae18 --- /dev/null +++ b/bench/device/sm50_simt_min_plus_f32_srgemm_nn_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape 
= cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // 
+ precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 
x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + 
const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; 
+ + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 
x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, 
// + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 
64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // 
benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_plus_f32_srgemm_nn_t.cu b/bench/device/sm50_simt_min_plus_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..45779bb --- /dev/null +++ b/bench/device/sm50_simt_min_plus_f32_srgemm_nn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); 
+ + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] 
+ = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps 
/ Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // 
+ precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ 
N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, 
// + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 32 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 32 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the f32 min-plus SRGEMM instantiated below on a { N, N, N } problem;
+// N is the benchmark's range(0) argument (256 to 4096, doubling each step).
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until the device has finished all queued work
+  }
+
+  // report throughput as a rate counter, counting 2 * N^3 operations per run
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+ using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_plus_f32_srgemm_nt_n.cu b/bench/device/sm50_simt_min_plus_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..7da6cd5 --- /dev/null +++ b/bench/device/sm50_simt_min_plus_f32_srgemm_nt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/
+//////////////////////////////////////////////////////////////////////
+// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+//////////////////////////////////////////////////////////////////////
+
+#include "benchmark/benchmark.h"
+
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+#include "harness.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size supplied by ->Range(...)
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // block until queued device work completes
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 8 x 4
// Warps / Block: 1 x 2
// Threadblock: 64 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
// Benchmarks a device-level min-plus semiring GEMM (SRGEMM) on a square
// N x N x N problem, where N is the benchmark argument state.range(0).
// All element types are float; the three layouts passed to Srgemm are
// column-major, row-major and column-major (presumably A, B and C in that
// order -- confirm against cuasr::gemm::device::Srgemm).
// Reports a "Flop/s" counter based on 2 * N^3 operations per iteration.
static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) {
  // NOTE(review): the cast's target type was missing in this copy of the file;
  // int is assumed -- confirm against the problem-size type BenchHarness expects.
  const auto N = static_cast<int>(state.range(0));
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): template argument restored as <precision>, matching the
  // element type used everywhere else in this function -- confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this ring/op-class/arch combination; only its
  // epilogue output operator is taken from it here.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Trailing `2` is presumably the pipeline stage count -- TODO confirm.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  // NOTE(review): harness template argument restored as <Srgemm>; the alias
  // above is otherwise unused -- confirm.
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    // Block until outstanding device work has finished so that each timed
    // iteration covers it.
    cudaDeviceSynchronize();
  }

  // 2 operations per (m, n, k) triple of the N x N x N problem.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 1
// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // 
+ precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 
128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 
4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_maximum_plus_ssrgemm_nt_t_sm50.cu b/bench/device/sm50_simt_min_plus_f32_srgemm_nt_t.cu similarity index 77% rename from bench/device/simt_maximum_plus_ssrgemm_nt_t_sm50.cu rename to bench/device/sm50_simt_min_plus_f32_srgemm_nt_t.cu index d432b83..1a2cdf3 100644 --- a/bench/device/simt_maximum_plus_ssrgemm_nt_t_sm50.cu +++ b/bench/device/sm50_simt_min_plus_f32_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // 
- AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif 
@@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 
2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 
8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // 
@@ -2159,7 +2116,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2209,7 +2165,7 @@ 
static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch 
= cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_min_plus_f32_srgemm_tn_n.cu b/bench/device/sm50_simt_min_plus_f32_srgemm_tn_n.cu new file mode 100644 index 0000000..dcf61e2 --- /dev/null +++ b/bench/device/sm50_simt_min_plus_f32_srgemm_tn_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // 
+ precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness 
bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + 
state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 
128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 
4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
diff --git a/bench/device/sm50_simt_min_plus_f32_srgemm_tn_t.cu b/bench/device/sm50_simt_min_plus_f32_srgemm_tn_t.cu
new file mode 100644
index 0000000..6a9ad4a
--- /dev/null
+++ b/bench/device/sm50_simt_min_plus_f32_srgemm_tn_t.cu
@@ -0,0 +1,2906 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+//////////////////////////////////////////////////////////////////////
+// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+//////////////////////////////////////////////////////////////////////
+
+#include "benchmark/benchmark.h"
+
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+#include "harness.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 1
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 1
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // problem size taken from Range() below
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                  //
+      precision, cutlass::layout::RowMajor,    //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,    //
+      precision, OpClass, SmArch,              //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // harness for a square N x N x N problem, parameterized on the kernel type
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // each iteration: one run() followed by a device-wide sync
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // op count reported per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 64x64x8, warp tile 32x64x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 128x32x8, warp tile 64x32x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 16x32x8, warp tile 8x16x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 16x64x8, warp tile 8x32x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 32x32x8, warp tile 16x16x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 32x64x8, warp tile 16x32x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 32x128x8, warp tile 16x64x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 64x32x8, warp tile 32x16x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 64x64x8, warp tile 32x32x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 64x128x8, warp tile 32x64x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 128x32x8, warp tile 64x16x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 128x64x8, warp tile 64x32x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x
16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 16x64x16, warp tile 8x16x16;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 16x128x16, warp tile 8x32x16;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 32x32x8, warp tile 16x8x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 32x64x8, warp tile 16x16x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 32x128x8, warp tile 16x32x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 32x256x8, warp tile 16x64x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 64x64x8, warp tile 32x16x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Google Benchmark case: f32 min-plus SRGEMM (SIMT, Sm50) on an N x N x N
+// problem, N = state.range(0). Threadblock tile 64x128x8, warp tile 32x32x8;
+// operand layouts RowMajor/ColumnMajor/RowMajor (presumably A/B/C, per "tn_t").
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the kernel type configured above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: block on the device so the timed iteration covers the GPU work
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations per iteration, reported as a per-second rate
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr,
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_plus_f32_srgemm_tt_n.cu b/bench/device/sm50_simt_min_plus_f32_srgemm_tt_n.cu new file mode 100644 index 0000000..efa18e3 --- /dev/null +++ b/bench/device/sm50_simt_min_plus_f32_srgemm_tt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 
8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, constructed from { N, N, N }
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: one run() per iteration, then wait for the device to go idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // evaluated in double: no int overflow
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp =
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_plus_f32_srgemm_tt_t.cu b/bench/device/sm50_simt_min_plus_f32_srgemm_tt_t.cu new file mode 100644 index 0000000..adf4cf4 --- /dev/null +++ b/bench/device/sm50_simt_min_plus_f32_srgemm_tt_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, 
SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_plus_f64_srgemm_nn_n.cu b/bench/device/sm50_simt_min_plus_f64_srgemm_nn_n.cu new file mode 100644 index 0000000..7c20c15 --- /dev/null +++ 
b/bench/device/sm50_simt_min_plus_f64_srgemm_nn_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape 
= cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 
128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + 
const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // 
benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] 
+ = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // 
+ ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 
x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + 
const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/simt_plus_multiplies_dsrgemm_tn_n_sm50.cu b/bench/device/sm50_simt_min_plus_f64_srgemm_nn_t.cu similarity index 69% rename from bench/device/simt_plus_multiplies_dsrgemm_tn_n_sm50.cu rename to bench/device/sm50_simt_min_plus_f64_srgemm_nn_t.cu index bb34240..03a0b1c 100644 --- a/bench/device/simt_plus_multiplies_dsrgemm_tn_n_sm50.cu +++ b/bench/device/sm50_simt_min_plus_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* 
Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void 
BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // 
Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void 
BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; 
- using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, 
cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = 
cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_multiplies_dsrgemm_nt_n_sm50.cu b/bench/device/sm50_simt_min_plus_f64_srgemm_nt_n.cu similarity index 69% rename from bench/device/simt_maximum_multiplies_dsrgemm_nt_n_sm50.cu rename to bench/device/sm50_simt_min_plus_f64_srgemm_nt_n.cu index 7469cac..25edcab 100644 --- a/bench/device/simt_maximum_multiplies_dsrgemm_nt_n_sm50.cu +++ b/bench/device/sm50_simt_min_plus_f64_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8 // Warps / Block: 1 
x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static 
void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { 
+static void BM_SM50_device_min_plus_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { 
const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; 
using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_ // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_ // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void 
BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ 
BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8 // Warps / 
Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_min_plus_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_ // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; 
- using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void 
BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x16_32x8x1_4x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_multiplies_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_multiplies_dsrgemm_tn_t_sm50.cu 
b/bench/device/sm50_simt_min_plus_f64_srgemm_nt_t.cu similarity index 69% rename from bench/device/simt_minimum_multiplies_dsrgemm_tn_t_sm50.cu rename to bench/device/sm50_simt_min_plus_f64_srgemm_nt_t.cu index c5ca3c9..d4543aa 100644 --- a/bench/device/simt_minimum_multiplies_dsrgemm_tn_t_sm50.cu +++ b/bench/device/sm50_simt_min_plus_f64_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4 // Warps / Block: 
1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 
+558,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_ // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_ // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_ // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8 // Warps / 
Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / 
Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x // Warps 
/ Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif 
@@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ 
BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x16_32x8x1_4x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_plus_dsrgemm_tn_n_sm50.cu b/bench/device/sm50_simt_min_plus_f64_srgemm_tn_n.cu similarity 
index 69% rename from bench/device/simt_minimum_plus_dsrgemm_tn_n_sm50.cu rename to bench/device/sm50_simt_min_plus_f64_srgemm_tn_n.cu index faab463..e16fa2a 100644 --- a/bench/device/simt_minimum_plus_dsrgemm_tn_n_sm50.cu +++ b/bench/device/sm50_simt_min_plus_f64_srgemm_tn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // 
@@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ 
-319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 
2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // 
@@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / 
Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State 
&state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_min_plus_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static 
void BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm50_simt_min_plus_f64_srgemm_tn_t.cu b/bench/device/sm50_simt_min_plus_f64_srgemm_tn_t.cu new file mode 100644 index 0000000..6aba987 --- /dev/null +++ 
b/bench/device/sm50_simt_min_plus_f64_srgemm_tn_t.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) 
{ + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) // f64 min_plus SRGEMM, bench level >= 1
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) // f64 min_plus SRGEMM, bench level >= 0
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) // f64 min_plus SRGEMM, bench level >= 1
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) // f64 min_plus SRGEMM, bench level >= 1
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) // f64 min_plus SRGEMM, bench level >= 1
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) // f64 min_plus SRGEMM, bench level >= 1
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) // f64 min_plus SRGEMM, bench level >= 0
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) // f64 min_plus SRGEMM, bench level >= 1
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) // f64 min_plus SRGEMM, bench level >= 2
+static void BM_SM50_device_min_plus_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // benchmark argument; reused for all three problem extents
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm instantiation above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: each timed iteration ends with a device-wide synchronization
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3 operations credited per iteration
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096); // N = 256, 512, 1024, 2048, 4096
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_plus_f64_srgemm_tt_n.cu b/bench/device/sm50_simt_min_plus_f64_srgemm_tt_n.cu new file mode 100644 index 0000000..f50f746 --- /dev/null +++ b/bench/device/sm50_simt_min_plus_f64_srgemm_tt_n.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // problem edge length taken from the benchmark range
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // set up the bench harness for this Srgemm instantiation on an { N, N, N } problem
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run the harness once per iteration, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // counted as 2 ops per (m, n, k) term, N^3 terms
+  state.counters["Flop/s"]
+    = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4)
+  ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 
x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup 
bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + 
} + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 
x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_min_plus_f64_srgemm_tt_t.cu b/bench/device/sm50_simt_min_plus_f64_srgemm_tt_t.cu new file mode 100644 index 0000000..a39112c --- /dev/null +++ b/bench/device/sm50_simt_min_plus_f64_srgemm_tt_t.cu @@ -0,0 +1,2073 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness 
+ cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 
16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_min_plus_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_min_plus_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_or_and_f32_srgemm_nn_n.cu b/bench/device/sm50_simt_or_and_f32_srgemm_nn_n.cu new file mode 100644 index 0000000..6812066 --- /dev/null +++ b/bench/device/sm50_simt_or_and_f32_srgemm_nn_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 
8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ 
: state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// 
Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, 
// + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git 
a/bench/device/sm50_simt_or_and_f32_srgemm_nn_t.cu b/bench/device/sm50_simt_or_and_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..fa5253a --- /dev/null +++ b/bench/device/sm50_simt_or_and_f32_srgemm_nn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { 
+ benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness 
+ cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the float or_and SRGEMM instantiated from the tile shapes below;
+// the size N is taken from the benchmark's range argument.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness for the Srgemm type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: launch via the harness, then block until the device is idle
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // credit 2 * N^3 operations to every iteration and report them as a rate
+  const double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+// NOTE(review): cuasr::or_and is named without template arguments here and in
+// the benchmarks that follow - confirm it is not a class template that needs
+// an element type such as <precision>.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmark: or_and f32 SRGEMM, SIMT op class, Sm50 arch tag, tile shapes as
+// listed above. range(0) gives N; the harness receives { N, N, N } and the
+// "Flop/s" counter assumes 2 * N^3 operations per iteration.
+// NOTE(review): cuasr::or_and is named without template arguments - confirm it
+// is not a class template that needs an element type such as <precision>.
+static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; static_cast needs an explicit target type.
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness, parameterized on the kernel type defined above
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for outstanding device work before the iteration ends
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_or_and_f32_srgemm_nt_n.cu b/bench/device/sm50_simt_or_and_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..2756e0f --- /dev/null +++ b/bench/device/sm50_simt_or_and_f32_srgemm_nt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using 
EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
+ precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { 
+ benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 1
// Threadblock: 64 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 64x64x8, warp tile 32x64x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 1
// Threadblock: 128 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 128x32x8, warp tile 64x32x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 16 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 16x32x8, warp tile 8x16x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 16 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 16x64x8, warp tile 8x32x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 32x32x8, warp tile 16x16x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 32x64x8, warp tile 16x32x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 32x128x8, warp tile 16x64x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 64 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 64x32x8, warp tile 32x16x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 64 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 64x64x8, warp tile 32x32x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 64 x 128 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 64x128x8, warp tile 32x64x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 128 x 32 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 128x32x8, warp tile 64x16x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 128 x 64 x 8
#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
// f32 or_and SRGEMM benchmark (Sm50, SIMT): threadblock tile 128x64x8, warp tile 64x32x8.
// Registered for extents 256..4096 (multiplier 2); reports a "Flop/s" counter.
static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
  // One benchmark argument drives all three extents handed to the harness.
  const auto N = static_cast<int>(state.range(0));

  // Element type, operator class and architecture tag.
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator and tile shapes.
  using RingOp           = cuasr::or_and<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // Default configuration for this type/ring/arch; only its epilogue operator is used.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // NOTE(review): the three (precision, layout) pairs are presumably the A, B and C
  // operands and the trailing 2 the stage count -- confirm against Srgemm's parameters.
  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                          //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, cutlass::layout::RowMajor,                            //
      precision, cutlass::layout::ColumnMajor,                         //
      precision, OpClass, SmArch,                                      //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // setup bench harness
  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });

  // benchmark loop: run once, then wait for the device before the next iteration
  for (auto _ : state) {
    benchmark::DoNotOptimize(bench.run());
    cudaDeviceSynchronize();
  }

  // 2 * N^3 operations per iteration, reported as a rate across all iterations.
  double flops_per_itr = 2.0 * N * N * N;
  state.counters["Flop/s"]
      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2)
    ->RangeMultiplier(2)->Range(256, 4096);
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 16 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_or_and_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
diff --git a/bench/device/sm50_simt_or_and_f32_srgemm_nt_t.cu b/bench/device/sm50_simt_or_and_f32_srgemm_nt_t.cu
new file mode 100644
index 0000000..34fd53e
--- /dev/null
+++ b/bench/device/sm50_simt_or_and_f32_srgemm_nt_t.cu
@@ -0,0 +1,2906 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+//////////////////////////////////////////////////////////////////////
+// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+//////////////////////////////////////////////////////////////////////
+
+#include "benchmark/benchmark.h"
+
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+#include "harness.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // one size used for all three problem dimensions
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize(); // block until all previously issued device work has completed
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness +
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr 
= 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // NOTE(review): <int> assumed, here and in every definition below - confirm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): <precision> assumed, here and below - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });  // NOTE(review): <Srgemm> assumed, here and below - confirm
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Bench harness for this Srgemm instantiation, constructed from { N, N, N }.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // Timed loop: one harness run per iteration, then wait for the device to finish.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // per-iteration count; the counter flag below reports it as a rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness 
+ cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the SIMT or_and f32 SRGEMM with a 64x64x8 threadblock tile and a
+// 16x16x8 warp tile on a cubic N x N x N problem, where N is the benchmark's
+// range argument. Only compiled when CUASR_BENCH_LEVEL is defined and >= 2.
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; it is narrowed to int here
+  // (assumed to be what the harness's { M, N, K } initializer expects -- TODO confirm).
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // The ring operator is instantiated on the element type.
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Layouts are column-major / row-major / row-major
+  // (presumably operands A / B / C, matching the "nt_t" suffix -- verify against Srgemm).
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness: it must be parameterized on the kernel under test,
+  // otherwise the Srgemm alias above is never used.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for the device after every run so the iteration
+  // does not end before outstanding GPU work has finished.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the counter flag makes the
+  // framework scale by the iteration count and report a per-second rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Benchmarks the SIMT or_and f32 SRGEMM with a 64x128x8 threadblock tile and a
+// 16x32x8 warp tile on a cubic N x N x N problem, where N is the benchmark's
+// range argument. Only compiled when CUASR_BENCH_LEVEL is defined and >= 2.
+static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; it is narrowed to int here
+  // (assumed to be what the harness's { M, N, K } initializer expects -- TODO confirm).
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // The ring operator is instantiated on the element type.
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Layouts are column-major / row-major / row-major
+  // (presumably operands A / B / C, matching the "nt_t" suffix -- verify against Srgemm).
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness: it must be parameterized on the kernel under test,
+  // otherwise the Srgemm alias above is never used.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for the device after every run so the iteration
+  // does not end before outstanding GPU work has finished.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the counter flag makes the
+  // framework scale by the iteration count and report a per-second rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; 
+ + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_or_and_f32_srgemm_tn_n.cu b/bench/device/sm50_simt_or_and_f32_srgemm_tn_n.cu new file mode 100644 index 0000000..e66ed95 --- /dev/null +++ b/bench/device/sm50_simt_or_and_f32_srgemm_tn_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 
8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm 
= cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Benchmarks the SIMT or_and f32 SRGEMM with an 8x64x8 threadblock tile and an
+// 8x32x8 warp tile on a cubic N x N x N problem, where N is the benchmark's
+// range argument. Only compiled when CUASR_BENCH_LEVEL is defined and >= 1.
+static void BM_SM50_device_or_and_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) {
+  // state.range(0) is a 64-bit value; it is narrowed to int here
+  // (assumed to be what the harness's { M, N, K } initializer expects -- TODO confirm).
+  const auto N = static_cast<int>(state.range(0));
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // The ring operator is instantiated on the element type.
+  using RingOp           = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Layouts are row-major / column-major / column-major
+  // (presumably operands A / B / C, matching the "tn_n" suffix -- verify against Srgemm).
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness: it must be parameterized on the kernel under test,
+  // otherwise the Srgemm alias above is never used.
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop: wait for the device after every run so the iteration
+  // does not end before outstanding GPU work has finished.
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  // 2 * N^3 operations are counted per iteration; the counter flag makes the
+  // framework scale by the iteration count and report a per-second rate.
+  double flops_per_itr = 2.0 * N * N * N;
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // 
+ precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : 
state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_or_and_f32_srgemm_tn_t.cu b/bench/device/sm50_simt_or_and_f32_srgemm_tn_t.cu new file mode 100644 index 0000000..b55456d --- /dev/null +++ b/bench/device/sm50_simt_or_and_f32_srgemm_tn_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, 
SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr 
= 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape 
= cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, 
SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr 
= 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness 
+ cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape 
= cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness 
+ cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git a/bench/device/sm50_simt_or_and_f32_srgemm_tt_n.cu b/bench/device/sm50_simt_or_and_f32_srgemm_tt_n.cu new file mode 100644 index 0000000..168fdc7 --- /dev/null +++ 
b/bench/device/sm50_simt_or_and_f32_srgemm_tt_n.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr 
= 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0)
+// Times the SM50 SIMT or_and f32 SRGEMM for this tile configuration on cubic
+// N x N x N problems and reports the rate through the "Flop/s" counter.
+static void BM_SM50_device_or_and_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0));  // edge length of the cubic problem
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // setup bench harness
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // benchmark loop
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();  // wait for outstanding device work before the iteration ends
+  }
+
+  double flops_per_itr = 2.0 * N * N * N;  // 2 * N^3 operations per run
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4)
+    ->RangeMultiplier(2)->Range(256, 4096);
+#endif
+
diff --git 
a/bench/device/sm50_simt_or_and_f32_srgemm_tt_t.cu b/bench/device/sm50_simt_or_and_f32_srgemm_tt_t.cu new file mode 100644 index 0000000..6811aa7 --- /dev/null +++ b/bench/device/sm50_simt_or_and_f32_srgemm_tt_t.cu @@ -0,0 +1,2906 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + 
using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; 
+ using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // 
benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) + ->RangeMultiplier(2)->Range(256, 
4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 
32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, 
// + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr 
= 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape 
= cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, 
// + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr 
= 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, 
SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm 
= cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, 
SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double 
flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + diff --git 
a/bench/device/simt_binary_or_binary_and_dsrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_or_and_f64_srgemm_nn_n.cu similarity index 69% rename from bench/device/simt_binary_or_binary_and_dsrgemm_nn_n_sm50.cu rename to bench/device/sm50_simt_or_and_f64_srgemm_nn_n.cu index 1573e38..6880025 100644 --- a/bench/device/simt_binary_or_binary_and_dsrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_or_and_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_16x64x1_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x128x8_16x64x1_4x8_ // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ 
BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4 // Warps / 
Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 
1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { 
+static void BM_SM50_device_or_and_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x8_16x64x1_4x8_ // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; 
- using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x8_64x16x1_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x8_64x16x1_8x4_ // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_16x128x16_8x32x1_2x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x8x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm 
= cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x8_16x32x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x8_16x32x1_4x4_ // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x8_32x16x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x8_32x16x1_4x4_ // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_or_and_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_32x128x16_8x32x1_2x4_ // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x16_16x8x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x16_32x8x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_multiplies_dsrgemm_tn_n_sm50.cu b/bench/device/sm50_simt_or_and_f64_srgemm_nn_t.cu similarity index 69% rename from bench/device/simt_minimum_multiplies_dsrgemm_tn_n_sm50.cu rename to bench/device/sm50_simt_or_and_f64_srgemm_nn_t.cu index 13a2903..4c84aa1 100644 --- a/bench/device/simt_minimum_multiplies_dsrgemm_tn_n_sm50.cu +++ b/bench/device/sm50_simt_or_and_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_16x64x1_4x8_4x8 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_32x32x1_8x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_16x16x1_4x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_16x32x1_4x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x128x8_16x64x1_4x8_4x // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_32x16x1_4x4_8x4 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_or_and_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_ // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_ // Warps / Block: 2 x 2 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ 
BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_16x32x1_4x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x8_16x64x1_4x8_4x // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void 
BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_32x16x1_4x4_8x4 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_32x32x1_8x4_4x8 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x8_64x16x1_8x4_8x // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x64x16_8x16x1_2x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_16x128x16_8x32x1_2x4_4x // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x8_16x16x1_4x2_4x8 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x8_16x32x1_4x4_4x // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_32x16x1_4x4_8x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_ // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_16x32x1_4x4_4x8 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x8_32x16x1_4x4_8x // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x64x16_8x16x1_2x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_32x128x16_8x32x1_2x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x32x16_16x8x1_2x2_8x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_64x64x8_16x16x1_4x2_4x8 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + 
using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x16_32x8x1_4x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_multiplies_dsrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_binary_or_binary_and_dsrgemm_nn_t_sm50.cu b/bench/device/sm50_simt_or_and_f64_srgemm_nt_n.cu similarity index 69% rename from bench/device/simt_binary_or_binary_and_dsrgemm_nn_t_sm50.cu rename to bench/device/sm50_simt_or_and_f64_srgemm_nt_n.cu index ebf1e47..ff09a86 100644 --- a/bench/device/simt_binary_or_binary_and_dsrgemm_nn_t_sm50.cu +++ b/bench/device/sm50_simt_or_and_f64_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_16x64x1_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x128x8_16x64x1_4x8_ // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void 
BM_SM50_device_or_and_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x // Warps / Block: 
2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 
+754,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x8_16x64x1_4x8_ // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x8_64x16x1_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x8_64x16x1_8x4_ // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_16x128x16_8x32x1_2x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x8x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x8_16x32x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x8_16x32x1_4x4_ // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 
8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x8_32x16x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x8_32x16x1_4x4_ // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_32x128x16_8x32x1_2x4_ // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x16_16x8x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto 
N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_or_and_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x16_32x8x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_binary_or_binary_and_dsrgemm_tt_n_sm50.cu b/bench/device/sm50_simt_or_and_f64_srgemm_nt_t.cu similarity index 69% rename from bench/device/simt_binary_or_binary_and_dsrgemm_tt_n_sm50.cu rename to bench/device/sm50_simt_or_and_f64_srgemm_nt_t.cu index 413790d..54ef31b 100644 --- a/bench/device/simt_binary_or_binary_and_dsrgemm_tt_n_sm50.cu +++ b/bench/device/sm50_simt_or_and_f64_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_16x64x1_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x128x8_16x64x1_4x8_ // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void 
BM_SM50_device_or_and_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x // Warps / Block: 2 
x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 
@@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x8_16x64x1_4x8_ // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x8_64x16x1_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x8_64x16x1_8x4_ // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_16x128x16_8x32x1_2x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x8x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x8_16x32x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x8_16x32x1_4x4_ // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x8_32x16x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x8_32x16x1_4x4_ // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_32x128x16_8x32x1_2x4_ // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using 
WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x16_16x8x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = 
double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; 
using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x16_32x8x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_minimum_dsrgemm_nn_t_sm50.cu b/bench/device/sm50_simt_or_and_f64_srgemm_tn_n.cu similarity index 69% rename from bench/device/simt_maximum_minimum_dsrgemm_nn_t_sm50.cu rename to bench/device/sm50_simt_or_and_f64_srgemm_tn_n.cu index f4ac483..6ad933b 100644 --- a/bench/device/simt_maximum_minimum_dsrgemm_nn_t_sm50.cu +++ b/bench/device/sm50_simt_or_and_f64_srgemm_tn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_or_and_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void 
BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, 
// precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, 
// + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, 
// + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; 
- using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< 
// + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_plus_multiplies_dsrgemm_tt_n_sm50.cu b/bench/device/sm50_simt_or_and_f64_srgemm_tn_t.cu similarity index 69% rename from bench/device/simt_plus_multiplies_dsrgemm_tt_n_sm50.cu rename to bench/device/sm50_simt_or_and_f64_srgemm_tn_t.cu index b20eadb..617f86a 100644 --- a/bench/device/simt_plus_multiplies_dsrgemm_tt_n_sm50.cu +++ b/bench/device/sm50_simt_or_and_f64_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static 
void BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // 
Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ 
BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void 
BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, 
// precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, 
// precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_plus_multiplies_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_binary_or_binary_and_dsrgemm_tn_t_sm50.cu b/bench/device/sm50_simt_or_and_f64_srgemm_tt_n.cu similarity index 69% rename from bench/device/simt_binary_or_binary_and_dsrgemm_tn_t_sm50.cu rename to bench/device/sm50_simt_or_and_f64_srgemm_tt_n.cu index 2e9673e..5e7d10c 100644 --- a/bench/device/simt_binary_or_binary_and_dsrgemm_tn_t_sm50.cu +++ b/bench/device/sm50_simt_or_and_f64_srgemm_tt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4 // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_16x64x1_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4 // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4 // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8 // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8 // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4 // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x128x8_16x64x1_4x8_ // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8 // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4 // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_or_and_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_8x32x1_2x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x // Warps / Block: 2 
x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 
+803,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4 // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x8_16x64x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x8_16x64x1_4x8_ // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void 
BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8 // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_32x32x1_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4 // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x8_64x16x1_8x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x8_64x16x1_8x4_ // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4 // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp 
= Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_16x128x16_8x32x1_2x4_ // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x8x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4 // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x8_16x32x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x8_16x32x1_4x4_ // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_32x16x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_8x16x1_2x2_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4 // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_16x32x1_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4 // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x8_32x16x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x8_32x16x1_4x4_ // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x16_8x16x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x16_8x32x1_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_32x128x16_8x32x1_2x4_ // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x16_16x8x1_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8 // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_16x16x1_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4 // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x16_32x8x1_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_binary_or_binary_and_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_minimum_dsrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_or_and_f64_srgemm_tt_t.cu similarity index 69% rename from bench/device/simt_maximum_minimum_dsrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_or_and_f64_srgemm_tt_t.cu index 1a70c9d..714af29 100644 --- a/bench/device/simt_maximum_minimum_dsrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_or_and_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x // Warps / Block: 1 x 1 // Threadblock: 
32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_or_and_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; 
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 
+793,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2 
// Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) 
-static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { 
+static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 
8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_or_and_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ 
BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4 // Warps / Block: 4 x 4 // 
Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 
+#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_or_and_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_ state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_or_and_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_or_and_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; 
+ using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_minimum_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_or_and_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_plus_ssrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_plus_mult_f32_srgemm_nn_n.cu similarity index 79% rename from bench/device/simt_minimum_plus_ssrgemm_nn_n_sm50.cu rename to bench/device/sm50_simt_plus_mult_f32_srgemm_nn_n.cu index bc3fafe..9e237c4 100644 --- a/bench/device/simt_minimum_plus_ssrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f32_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; 
using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; 
- using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
+ RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // 
Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto 
N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1959,7 +1920,7 @@ 
static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2159,7 +2116,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2209,7 +2165,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 
2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static 
void BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { 
const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 
64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nn_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_plus_ssrgemm_nt_n_sm50.cu b/bench/device/sm50_simt_plus_mult_f32_srgemm_nn_t.cu similarity index 79% rename from bench/device/simt_minimum_plus_ssrgemm_nt_n_sm50.cu rename to bench/device/sm50_simt_plus_mult_f32_srgemm_nn_t.cu index e3d8f84..95ac670 100644 --- a/bench/device/simt_minimum_plus_ssrgemm_nt_n_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f32_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // 
Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1283,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1293,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1342,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,30 +1390,29 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1440,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1489,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1538,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1587,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1636,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1675,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1685,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1734,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +1773,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,29 +1783,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1859,7 +1822,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,29 +1832,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1909,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,29 +1881,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1959,7 +1920,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,29 +1930,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,29 +1979,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,29 +2028,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,29 +2077,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2159,7 +2116,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,30 +2125,29 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2209,7 +2165,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,29 +2175,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,29 +2224,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,29 +2273,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,29 +2322,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,29 +2371,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // 
Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2459,7 +2410,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,29 +2420,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,29 +2469,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2559,7 +2508,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,29 +2518,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2609,7 +2557,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,29 +2567,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2659,7 +2606,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,29 +2616,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,29 +2665,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2759,7 +2704,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,29 +2714,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm 
= cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,29 +2812,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2859,7 +2900,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nn_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_ssrgemm_tn_n_sm50.cu b/bench/device/sm50_simt_plus_mult_f32_srgemm_nt_n.cu similarity index 79% rename from bench/device/simt_maximum_plus_ssrgemm_tn_n_sm50.cu rename to bench/device/sm50_simt_plus_mult_f32_srgemm_nt_n.cu index 770c4be..c89e55f 100644 --- a/bench/device/simt_maximum_plus_ssrgemm_tn_n_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f32_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,28 +19,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,28 +68,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,28 +117,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,28 +166,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,28 +215,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -259,7 +254,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,28 +264,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ 
-309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,28 +313,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,28 +362,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,28 +411,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,28 +460,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,28 +509,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,28 +558,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,28 +607,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,28 +656,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,28 +705,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,28 +754,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,28 +803,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,28 +852,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,28 +901,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,28 +950,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,28 +999,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,28 +1048,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,28 +1097,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,28 +1146,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,28 +1195,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,28 +1244,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,28 +1293,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,28 +1342,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,29 +1390,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,28 +1440,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,28 +1489,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,28 +1538,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,28 +1587,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,28 +1636,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,28 +1685,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and 
(CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,28 +1734,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 
8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1809,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,28 +1783,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 
x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,28 +1832,27 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1919,28 +1881,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,28 +1930,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,28 +1979,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,28 +2028,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,28 +2077,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2159,7 +2116,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,29 +2125,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2209,7 +2165,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,28 +2175,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,28 +2224,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,28 +2273,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,28 +2322,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,28 +2371,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,28 +2420,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,28 +2469,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,28 +2518,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,28 +2567,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,28 +2616,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,28 +2665,27 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,28 +2714,76 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,28 +2812,76 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_ssrgemm_tt_n_sm50.cu b/bench/device/sm50_simt_plus_mult_f32_srgemm_nt_t.cu similarity index 79% rename from bench/device/simt_maximum_plus_ssrgemm_tt_n_sm50.cu rename to bench/device/sm50_simt_plus_mult_f32_srgemm_nt_t.cu index d25b3ce..8ae4e4d 100644 --- a/bench/device/simt_maximum_plus_ssrgemm_tt_n_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f32_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // 
Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1293,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1342,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,30 +1390,29 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1587,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1626,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1636,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1685,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +1773,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,29 +1881,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,29 +1930,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,29 +1979,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,29 +2028,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2109,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,29 +2077,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2159,7 +2116,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,30 +2125,29 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2209,7 +2165,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,29 +2175,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,29 +2224,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,29 +2273,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,29 +2322,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,29 +2371,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // 
Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,29 +2420,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2509,7 +2459,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,29 +2469,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,29 +2518,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2609,7 +2557,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,29 +2567,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,29 +2616,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,29 +2665,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,29 +2714,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,29 +2812,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_nt_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_ssrgemm_nt_n_sm50.cu b/bench/device/sm50_simt_plus_mult_f32_srgemm_tn_n.cu similarity index 79% rename from bench/device/simt_maximum_plus_ssrgemm_nt_n_sm50.cu rename to bench/device/sm50_simt_plus_mult_f32_srgemm_tn_n.cu index feb5e4c..3679d1d 100644 --- a/bench/device/simt_maximum_plus_ssrgemm_nt_n_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f32_srgemm_tn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // 
Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1293,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1342,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,30 +1390,29 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1587,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1626,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1636,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1685,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +1773,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,29 +1881,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // 
+ RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,29 +1930,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,29 +1979,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,29 +2028,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2109,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,29 +2077,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2159,7 +2116,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,30 +2125,29 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2209,7 +2165,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,29 +2175,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,29 +2224,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,29 +2273,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,29 +2322,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,29 +2371,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 
// Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,29 +2420,28 
@@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2509,7 +2459,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,29 +2469,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,29 +2518,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2609,7 +2557,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,29 +2567,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,29 +2616,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,29 +2665,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,29 +2714,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,29 +2812,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_nt_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_plus_ssrgemm_tt_n_sm50.cu b/bench/device/sm50_simt_plus_mult_f32_srgemm_tn_t.cu similarity index 79% rename from bench/device/simt_minimum_plus_ssrgemm_tt_n_sm50.cu rename to bench/device/sm50_simt_plus_mult_f32_srgemm_tn_t.cu index 15e1bc6..54ac2e8 100644 --- a/bench/device/simt_minimum_plus_ssrgemm_tt_n_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f32_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, 
// precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 
16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static 
void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1283,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1293,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1342,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; 
+ using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,30 +1390,29 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1440,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1489,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1538,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1587,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1636,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1675,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1685,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1734,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,29 +1783,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1859,7 +1822,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,29 +1832,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1909,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,29 +1881,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1959,7 +1920,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,29 +1930,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,29 +1979,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,29 +2028,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,29 +2077,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2159,7 +2116,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,30 +2125,29 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2209,7 +2165,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,29 +2175,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,29 +2224,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,29 +2273,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,29 +2322,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,29 +2371,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2459,7 +2410,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,29 +2420,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 
// Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,29 +2469,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2559,7 +2508,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,29 +2518,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2609,7 +2557,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,29 +2567,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2659,7 +2606,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,29 +2616,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,29 +2665,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2759,7 +2704,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,29 +2714,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,29 +2812,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, 
// + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2859,7 +2900,7 
@@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tn_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_ssrgemm_tn_t_sm50.cu b/bench/device/sm50_simt_plus_mult_f32_srgemm_tt_n.cu similarity index 79% rename from bench/device/simt_maximum_plus_ssrgemm_tn_t_sm50.cu rename to bench/device/sm50_simt_plus_mult_f32_srgemm_tt_n.cu index 38a96a4..4a76893 100644 --- a/bench/device/simt_maximum_plus_ssrgemm_tn_t_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f32_srgemm_tt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // 
Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1283,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1293,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1342,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1381,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,30 +1390,29 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1587,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1626,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1636,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1675,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1685,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +1773,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1859,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1909,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,29 +1881,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1959,7 +1920,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,29 +1930,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2019,29 +1979,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2059,7 +2018,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,29 +2028,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2109,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,29 +2077,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2159,7 +2116,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,30 +2125,29 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2209,7 +2165,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,29 +2175,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2259,7 +2214,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,29 +2224,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2309,7 +2263,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,29 +2273,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2359,7 +2312,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,29 +2322,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2409,7 +2361,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,29 +2371,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // 
Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2459,7 +2410,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,29 +2420,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2509,7 +2459,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,29 +2469,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2559,7 +2508,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,29 +2518,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2609,7 +2557,7 @@ static void 
BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,29 +2567,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2659,7 +2606,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,29 +2616,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2709,7 +2655,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,29 +2665,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2759,7 +2704,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,29 +2714,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2809,7 +2802,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,29 +2812,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2859,7 +2900,7 @@ static void BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_ssrgemm_tn_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_n_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_plus_ssrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_plus_mult_f32_srgemm_tt_t.cu similarity index 79% rename from bench/device/simt_minimum_plus_ssrgemm_tt_t_sm50.cu rename to bench/device/sm50_simt_plus_mult_f32_srgemm_tt_t.cu index bcb72e4..1d6c973 100644 --- a/bench/device/simt_minimum_plus_ssrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f32_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 
x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_32x64x1_8x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_64x32x1_8x8_8x4_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_32x64x1_8x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_64x32x1_8x8_8x4_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x64x1_8x8_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_64x32x1_8x8_8x4_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / 
Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1293,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 
8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1332,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1342,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_32x64x1_8x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1381,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1418,27 +1390,26 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_64x32x1_8x8_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1587,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1626,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1636,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1675,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1685,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -1759,7 +1724,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +1773,7 @@ static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1819,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1859,7 +1822,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1869,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1909,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1919,26 +1881,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x256x8_32x64x1_8x8_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1959,7 +1920,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1969,26 +1930,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_64x32x1_8x8_8x4_2x4 // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -2019,26 +1979,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2059,7 +2018,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2069,26 +2028,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps 
/ Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2109,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2119,26 +2077,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2159,7 +2116,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2168,27 +2125,26 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State 
&state) { +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2209,7 +2165,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2219,26 +2175,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2259,7 +2214,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2269,26 +2224,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_32x64x1_8x8_4x8_4x2 // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { const auto 
N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2309,7 +2263,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2319,26 +2273,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2359,7 +2312,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2369,26 +2322,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x64x8_64x32x1_8x8_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2409,7 +2361,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2419,26 +2371,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2459,7 +2410,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2469,26 +2420,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2509,7 +2459,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2519,26 +2469,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2559,7 +2508,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2569,26 +2518,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
+ precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2609,7 +2557,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2619,26 +2567,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2659,7 +2606,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2669,26 +2616,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_64x256x8_16x64x1_4x8_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2709,7 +2655,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2719,26 +2665,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2759,7 +2704,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2769,26 +2714,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2809,7 +2802,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_128x256x8_32x64x1_8x8_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -2819,26 +2812,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_128x128x8_32x32x1_8x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2859,7 +2900,7 @@ static void BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_ssrgemm_tt_t_256x64x8_64x16x1_8x4_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f32_srgemm_tt_t_256x128x8_64x32x1_8x8_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git 
a/bench/device/simt_minimum_plus_dsrgemm_nn_n_sm50.cu b/bench/device/sm50_simt_plus_mult_f64_srgemm_nn_n.cu similarity index 72% rename from bench/device/simt_minimum_plus_dsrgemm_nn_n_sm50.cu rename to bench/device/sm50_simt_plus_mult_f64_srgemm_nn_n.cu index 4a89a3a..92bb196 100644 --- a/bench/device/simt_minimum_plus_dsrgemm_nn_n_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, 
// precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void 
BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; 
using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static 
void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x256x8_16x64x1_4x8_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and 
(CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto 
N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_dsrgemm_nt_n_sm50.cu b/bench/device/sm50_simt_plus_mult_f64_srgemm_nn_t.cu similarity index 72% rename from bench/device/simt_maximum_plus_dsrgemm_nt_n_sm50.cu rename to bench/device/sm50_simt_plus_mult_f64_srgemm_nn_t.cu index cfed1c5..d02f5e9 100644 --- a/bench/device/simt_maximum_plus_dsrgemm_nt_n_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State 
&state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // 
Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void 
BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, 
// precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using 
WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // 
Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements 
/ Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 
x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // 
Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); 
+#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_plus_dsrgemm_nn_t_sm50.cu b/bench/device/sm50_simt_plus_mult_f64_srgemm_nt_n.cu similarity index 71% rename from bench/device/simt_minimum_plus_dsrgemm_nn_t_sm50.cu rename to 
bench/device/sm50_simt_plus_mult_f64_srgemm_nt_n.cu index 36a8691..55912d7 100644 --- a/bench/device/simt_minimum_plus_dsrgemm_nn_t_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f64_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp 
= cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // 
Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ 
BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void 
BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + 
using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp 
= cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision 
= double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using 
OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp 
= cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using 
WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void 
BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_dsrgemm_nt_t_sm50.cu b/bench/device/sm50_simt_plus_mult_f64_srgemm_nt_t.cu similarity index 71% rename from bench/device/simt_maximum_plus_dsrgemm_nt_t_sm50.cu rename to bench/device/sm50_simt_plus_mult_f64_srgemm_nt_t.cu index 327485e..5cb323d 100644 --- a/bench/device/simt_maximum_plus_dsrgemm_nt_t_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f64_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / 
Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 
8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ 
-809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 
16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void 
BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 
128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_nt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_dsrgemm_nn_t_sm50.cu b/bench/device/sm50_simt_plus_mult_f64_srgemm_tn_n.cu similarity index 71% rename from bench/device/simt_maximum_plus_dsrgemm_nn_t_sm50.cu rename to bench/device/sm50_simt_plus_mult_f64_srgemm_tn_n.cu index dcac1f3..f57edd6 100644 --- a/bench/device/simt_maximum_plus_dsrgemm_nn_t_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f64_srgemm_tn_n.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,29 +19,28 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,29 +68,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, 
OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,29 +117,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,29 +166,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,29 +215,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,29 +264,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,29 +313,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,29 +362,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,29 +411,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,29 +460,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,29 +509,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,29 +558,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,29 +607,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,29 +656,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,29 +705,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // 
Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,29 +754,28 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,29 +803,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,29 +852,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -909,7 +891,7 @@ static void 
BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,29 +901,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,29 +950,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,29 +999,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,29 +1048,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,29 +1097,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,29 +1146,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,29 +1195,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,29 +1244,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1309,7 +1332,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,29 +1342,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,29 +1440,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,29 +1489,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,29 +1538,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,29 +1587,126 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1559,7 +1724,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,29 +1734,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,29 +1783,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,29 +1832,28 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,29 +1881,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,29 +1979,77 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = 
cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_nn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_dsrgemm_tn_t_sm50.cu b/bench/device/sm50_simt_plus_mult_f64_srgemm_tn_t.cu similarity index 71% rename from bench/device/simt_maximum_plus_dsrgemm_tn_t_sm50.cu rename to bench/device/sm50_simt_plus_mult_f64_srgemm_tn_t.cu index 42af41e..1b2795a 100644 --- a/bench/device/simt_maximum_plus_dsrgemm_tn_t_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f64_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 
32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 
8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ 
-809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // 
Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 
16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void 
BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) + 
->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_64x128x8_16x32x1_4x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 
128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tn_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tn_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_maximum_plus_dsrgemm_tt_n_sm50.cu b/bench/device/sm50_simt_plus_mult_f64_srgemm_tt_n.cu similarity index 71% rename from bench/device/simt_maximum_plus_dsrgemm_tt_n_sm50.cu rename to bench/device/sm50_simt_plus_mult_f64_srgemm_tt_n.cu index 7ccb50d..83b98bf 100644 --- a/bench/device/simt_maximum_plus_dsrgemm_tt_n_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f64_srgemm_tt_n.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, 
benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -109,7 +107,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 
4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / 
Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -319,26 +313,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; 
- using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
@@ -909,7 +891,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2 
state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1009,7 +989,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ 
BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 
32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void 
BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 
16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1659,7 +1822,7 @@ static void 
BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_64x128x8_16x32x1_4x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_maximum_plus_dsrgemm_tt_n_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_n_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/simt_minimum_plus_dsrgemm_tt_t_sm50.cu b/bench/device/sm50_simt_plus_mult_f64_srgemm_tt_t.cu similarity index 71% rename from bench/device/simt_minimum_plus_dsrgemm_tt_t_sm50.cu 
rename to bench/device/sm50_simt_plus_mult_f64_srgemm_tt_t.cu index 0ed8bc8..43e4292 100644 --- a/bench/device/simt_minimum_plus_dsrgemm_tt_t_sm50.cu +++ b/bench/device/sm50_simt_plus_mult_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ////////////////////////////////////////////////////////////////////// // THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -19,26 +19,25 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -59,7 +58,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -69,26 +68,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x32x8_8x32x1_2x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -109,7 +107,7 @@ static void 
BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -119,26 +117,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -159,7 +156,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -169,26 +166,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,7 +205,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -219,26 +215,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -259,7 +254,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -269,26 +264,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x32x8_8x16x1_2x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -309,7 +303,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2(b state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ 
-319,26 +313,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_8x64x8_8x32x1_2x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -359,7 +352,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -369,26 +362,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 
// Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -409,7 +401,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -419,26 +411,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void 
BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -459,7 +450,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -469,26 +460,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -509,7 +499,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -519,26 +509,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2(benchmark::State &state) { const auto N = 
static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -559,7 +548,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -569,26 +558,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -609,7 +597,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -619,26 +607,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -659,7 +646,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -669,26 +656,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -709,7 +695,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -719,26 +705,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -759,7 +744,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -769,26 +754,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -809,7 +793,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -819,26 +803,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -859,7 +842,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -869,26 +852,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -909,7 +891,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -919,26 +901,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -959,7 +940,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -969,26 +950,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1009,7 +989,7 @@ static void 
BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1019,26 +999,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1059,7 +1038,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1069,26 +1048,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,7 +1087,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } 
-BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1119,26 +1097,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1159,7 +1136,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1169,26 +1146,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1209,7 +1185,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) ->RangeMultiplier(2)->Range(256, 4096); 
#endif @@ -1219,26 +1195,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1259,7 +1234,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1269,26 +1244,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x8_16x16x1_4x2_4x8_2x4) // Warps 
/ Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1309,7 +1283,56 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if 
defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x256x8_16x64x1_4x8_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1319,26 +1342,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { +static void 
BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void 
BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1359,7 +1430,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x128x8_32x32x1_8x4_4x8_2x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1369,26 +1440,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1409,7 +1479,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2( state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1419,26 +1489,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1459,7 +1528,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1469,26 +1538,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1509,7 +1577,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1519,26 +1587,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 0) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1559,7 +1675,56 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x64x8_32x32x1_8x4_4x8_4x2) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_256x32x8_64x16x1_8x4_8x4_4x2) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1569,26 +1734,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1609,7 +1773,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1619,26 +1783,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1659,7 +1822,7 @@ static void 
BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1669,26 +1832,25 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1709,7 +1871,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4 state.counters["Flop/s"] = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1719,26 +1881,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1759,7 +1969,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4 state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_64x128x8_16x32x1_4x4_4x8_4x4) 
->RangeMultiplier(2)->Range(256, 4096); #endif @@ -1769,26 +1979,74 @@ BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 2) -static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4(benchmark::State &state) { const auto N = static_cast(state.range(0)); using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) + ->RangeMultiplier(2)->Range(256, 4096); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_BENCH_LEVEL) and (CUASR_BENCH_LEVEL >= 1) +static void BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1809,7 +2067,7 @@ static void BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x state.counters["Flop/s"] = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); } -BENCHMARK(BM_SM50_device_minimum_plus_dsrgemm_tt_t_128x32x16_32x8x1_4x2_8x4_4x4) +BENCHMARK(BM_SM50_device_plus_mult_f64_srgemm_tt_t_128x64x8_32x16x1_4x4_8x4_4x4) ->RangeMultiplier(2)->Range(256, 4096); #endif diff --git a/bench/device/sm80_defaults.cu b/bench/device/sm80_defaults.cu new file mode 100644 index 0000000..f38103e --- /dev/null +++ b/bench/device/sm80_defaults.cu @@ -0,0 +1,6158 @@ 
+/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +////////////////////////////////////////////////////////////////////// +// THIS BENCHMARK FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +////////////////////////////////////////////////////////////////////// + +#include "benchmark/benchmark.h" + +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +#include "harness.h" + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; 
+ using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static 
void BM_SM80_default_plus_mult_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); 
+ +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM80_default_plus_mult_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); 
+ + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, 
SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using 
SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_plus_mult_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_plus_mult_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = 
static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void 
BM_SM80_default_min_plus_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM80_default_min_plus_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] 
+ = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); 
+ } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto 
_ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_plus_s32_srgemm_nn_t(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_plus_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static 
void BM_SM80_default_max_plus_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM80_default_max_plus_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + 
= benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // 
benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // 
setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = 
cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_s32_srgemm_nn_n(benchmark::State &state) { + const 
auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_plus_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_plus_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + 
+static void BM_SM80_default_min_max_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM80_default_min_max_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark 
loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench 
harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + 
using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + 
using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_max_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void 
BM_SM80_default_min_max_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_max_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_min_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_min_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + 
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision double; layouts (template order): RowMajor, RowMajor, RowMajor.
+static void BM_SM80_default_max_min_f64_srgemm_tt_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f64_srgemm_tt_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision double; layouts (template order): RowMajor, ColumnMajor, ColumnMajor.
+static void BM_SM80_default_max_min_f64_srgemm_tn_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f64_srgemm_tn_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision double; layouts (template order): RowMajor, ColumnMajor, RowMajor.
+static void BM_SM80_default_max_min_f64_srgemm_tn_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f64_srgemm_tn_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision double; layouts (template order): ColumnMajor, RowMajor, ColumnMajor.
+static void BM_SM80_default_max_min_f64_srgemm_nt_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f64_srgemm_nt_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision double; layouts (template order): ColumnMajor, RowMajor, RowMajor.
+static void BM_SM80_default_max_min_f64_srgemm_nt_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f64_srgemm_nt_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision double; layouts (template order): ColumnMajor, ColumnMajor, ColumnMajor.
+static void BM_SM80_default_max_min_f64_srgemm_nn_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f64_srgemm_nn_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision double; layouts (template order): ColumnMajor, ColumnMajor, RowMajor.
+static void BM_SM80_default_max_min_f64_srgemm_nn_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f64_srgemm_nn_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision float; layouts (template order): RowMajor, RowMajor, ColumnMajor.
+static void BM_SM80_default_max_min_f32_srgemm_tt_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f32_srgemm_tt_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision float; layouts (template order): RowMajor, RowMajor, RowMajor.
+static void BM_SM80_default_max_min_f32_srgemm_tt_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f32_srgemm_tt_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision float; layouts (template order): RowMajor, ColumnMajor, ColumnMajor.
+static void BM_SM80_default_max_min_f32_srgemm_tn_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f32_srgemm_tn_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision float; layouts (template order): RowMajor, ColumnMajor, RowMajor.
+static void BM_SM80_default_max_min_f32_srgemm_tn_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f32_srgemm_tn_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision float; layouts (template order): ColumnMajor, RowMajor, ColumnMajor.
+static void BM_SM80_default_max_min_f32_srgemm_nt_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f32_srgemm_nt_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision float; layouts (template order): ColumnMajor, RowMajor, RowMajor.
+static void BM_SM80_default_max_min_f32_srgemm_nt_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f32_srgemm_nt_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision float; layouts (template order): ColumnMajor, ColumnMajor, ColumnMajor.
+static void BM_SM80_default_max_min_f32_srgemm_nn_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f32_srgemm_nn_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision float; layouts (template order): ColumnMajor, ColumnMajor, RowMajor.
+static void BM_SM80_default_max_min_f32_srgemm_nn_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_f32_srgemm_nn_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision int; layouts (template order): RowMajor, RowMajor, ColumnMajor.
+static void BM_SM80_default_max_min_s32_srgemm_tt_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_s32_srgemm_tt_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision int; layouts (template order): RowMajor, RowMajor, RowMajor.
+static void BM_SM80_default_max_min_s32_srgemm_tt_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_s32_srgemm_tt_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision int; layouts (template order): RowMajor, ColumnMajor, ColumnMajor.
+static void BM_SM80_default_max_min_s32_srgemm_tn_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_s32_srgemm_tn_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision int; layouts (template order): RowMajor, ColumnMajor, RowMajor.
+static void BM_SM80_default_max_min_s32_srgemm_tn_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_s32_srgemm_tn_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision int; layouts (template order): ColumnMajor, RowMajor, ColumnMajor.
+static void BM_SM80_default_max_min_s32_srgemm_nt_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_s32_srgemm_nt_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision int; layouts (template order): ColumnMajor, RowMajor, RowMajor.
+static void BM_SM80_default_max_min_s32_srgemm_nt_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_s32_srgemm_nt_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision int; layouts (template order): ColumnMajor, ColumnMajor, ColumnMajor.
+static void BM_SM80_default_max_min_s32_srgemm_nn_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_s32_srgemm_nn_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT max_min SRGEMM benchmark, precision int; layouts (template order): ColumnMajor, ColumnMajor, RowMajor.
+static void BM_SM80_default_max_min_s32_srgemm_nn_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::max_min<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_max_min_s32_srgemm_nn_t)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT min_mult SRGEMM benchmark, precision double; layouts (template order): RowMajor, RowMajor, ColumnMajor.
+static void BM_SM80_default_min_mult_f64_srgemm_tt_n(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::min_mult<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_min_mult_f64_srgemm_tt_n)
+    ->RangeMultiplier(2)->Range(256, 4096);
+
+///////////////////////////////////////////////////////////////////////////////
+// SM80 SIMT min_mult SRGEMM benchmark, precision double; layouts (template order): RowMajor, RowMajor, RowMajor.
+static void BM_SM80_default_min_mult_f64_srgemm_tt_t(benchmark::State &state) {
+  const auto N = static_cast<int>(state.range(0)); // first benchmark argument: the problem size
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm80;
+  using RingOp = cuasr::min_mult<precision>;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  // build the harness once, outside the timed loop, with N for all three extents
+  cuasr::bench::device::BenchHarness<Srgemm> bench({ N, N, N });
+
+  // timed loop: run once per iteration and wait for the device before the next one
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(bench.run());
+    cudaDeviceSynchronize();
+  }
+
+  double flops_per_itr = 2.0 * N * N * N; // 2 * N^3, reported below as a per-second rate
+  state.counters["Flop/s"]
+      = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SM80_default_min_mult_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); 
+ + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // 
setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, 
// + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp 
= cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_s32_srgemm_tn_t(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void 
BM_SM80_default_min_mult_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_min_mult_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_min_mult_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM80_default_max_mult_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + 
cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + 
// benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + 
// setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // 
+ precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = 
cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass 
= cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_s32_srgemm_tn_n(benchmark::State &state) { + const auto N 
= static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_s32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void 
BM_SM80_default_max_mult_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_max_mult_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_max_mult_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} 
+BENCHMARK(BM_SM80_default_max_mult_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f64_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f64_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f64_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = 
benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f64_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f64_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f64_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f64_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + 
double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f64_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f64_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f64_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f64_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + 
benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f64_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f64_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f64_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f64_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + 
cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f64_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f32_srgemm_tn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_f32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + 
using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_f32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_s32_srgemm_tt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_s32_srgemm_tt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_s32_srgemm_tt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using 
precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_s32_srgemm_tt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_s32_srgemm_tn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_s32_srgemm_tn_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_s32_srgemm_tn_t(benchmark::State 
&state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_s32_srgemm_tn_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_s32_srgemm_nt_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_s32_srgemm_nt_n) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + 
+static void BM_SM80_default_or_and_s32_srgemm_nt_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_s32_srgemm_nt_t) + ->RangeMultiplier(2)->Range(256, 4096); + +/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_s32_srgemm_nn_n(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_s32_srgemm_nn_n) + ->RangeMultiplier(2)->Range(256, 4096); + 
+/////////////////////////////////////////////////////////////////////////////// + +static void BM_SM80_default_or_and_s32_srgemm_nn_t(benchmark::State &state) { + const auto N = static_cast(state.range(0)); + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + // setup bench harness + cuasr::bench::device::BenchHarness bench({ N, N, N }); + + // benchmark loop + for (auto _ : state) { + benchmark::DoNotOptimize(bench.run()); + cudaDeviceSynchronize(); + } + + double flops_per_itr = 2.0 * N * N * N; + state.counters["Flop/s"] + = benchmark::Counter(flops_per_itr, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(BM_SM80_default_or_and_s32_srgemm_nn_t) + ->RangeMultiplier(2)->Range(256, 4096); diff --git a/cutlass b/cutlass index c2b80ad..e45e773 160000 --- a/cutlass +++ b/cutlass @@ -1 +1 @@ -Subproject commit c2b80ad4e4f8b60a65500bd04c8fecddff2ba355 +Subproject commit e45e77343693e261f9285ee05be4f0498848e5a5 diff --git a/examples/00_minplus_srgemm/CMakeLists.txt b/examples/00_minplus_srgemm/CMakeLists.txt index 3ea6fe6..2677001 100644 --- a/examples/00_minplus_srgemm/CMakeLists.txt +++ b/examples/00_minplus_srgemm/CMakeLists.txt @@ -5,6 +5,7 @@ target_include_directories(minplus_srgemm PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/cutlass/include + ${PROJECT_SOURCE_DIR}/cutlass/tools/util/include ${CUDA_INCLUDE_DIRS} ) target_compile_options(minplus_srgemm diff --git a/examples/00_minplus_srgemm/minplus_srgemm.cu b/examples/00_minplus_srgemm/minplus_srgemm.cu index d15d973..cbf245b 100644 --- a/examples/00_minplus_srgemm/minplus_srgemm.cu +++ b/examples/00_minplus_srgemm/minplus_srgemm.cu @@ -20,33 +20,31 @@ auto 
cuasr_minplus_srsgemm_nt_n( bool do_epilogue_min, cudaStream_t stream = nullptr) -> int { // compile time configuration of this srgemm kernel using OperatorClass - using OperatorClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - using AdditionOp = cuasr::minimum; - using MultiplicationOp = cuasr::plus; + using OperatorClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using TropicalConfig = typename cuasr::gemm::device::DefaultSemiRingConfiguration< - float, float, float, float, OperatorClass, // - AdditionOp, MultiplicationOp, SmArch>; + float, float, float, float, RingOp, + OperatorClass, SmArch>; using ColumnMajor = cutlass::layout::ColumnMajor; using RowMajor = cutlass::layout::RowMajor; using cuASR_MinPlus_SGEMM = cuasr::gemm::device::Srgemm< - AdditionOp, // Thread level SemiRing operator - MultiplicationOp, // Thread level SemiRing operator + RingOp, // SemiRing operator float, // element type of A ColumnMajor, // layout of A float, // element type of B RowMajor, // layout of B float, // element t ype of C - ColumnMajor, // layout of C + ColumnMajor, // layout of C float // element type of D >; - float alpha = MultiplicationOp::Identity; + float alpha = RingOp::MultIdentity; float beta - = do_epilogue_min ? MultiplicationOp::Identity : MultiplicationOp::Annihilator; + = do_epilogue_min ? 
RingOp::MultIdentity : RingOp::MultAnnihilator; // construct kernel arguments struct cuASR_MinPlus_SGEMM::Arguments args( @@ -122,9 +120,9 @@ int main() { auto start = high_resolution_clock::now(); for (int i = 0; i < repeats; ++i) { retval |= cuasr_minplus_srsgemm_nt_n(M, N, K, d_A, M, d_B, K, d_C, M, true, nullptr); - cudaDeviceSynchronize(); } - auto end = high_resolution_clock::now(); + cudaDeviceSynchronize(); + auto end = high_resolution_clock::now(); auto delta = duration_cast(end - start).count(); if (retval) { diff --git a/examples/01_userdefined_semiring/userdefined_semiring.cu b/examples/01_userdefined_semiring/userdefined_semiring.cu index 601e8ac..7f292a7 100644 --- a/examples/01_userdefined_semiring/userdefined_semiring.cu +++ b/examples/01_userdefined_semiring/userdefined_semiring.cu @@ -51,54 +51,31 @@ * multi-stage pipelined SRGEMM is planned for the future. */ -// clang-format off namespace { -template -struct binary_xor { - static T constexpr Identity = static_cast(false); - // expose base scalar operator - __host__ __device__ - T operator()(T lhs, T const &rhs) const { - lhs ^= rhs; - return lhs; - } +template +struct xor_and { + static T constexpr AddIdentity = static_cast(false); + static T constexpr MultIdentity = static_cast(true); + static T constexpr MultAnnihilator = static_cast(false); __host__ __device__ - cutlass::Array - operator()(cutlass::Array const &lhs, cutlass::Array const &rhs) const { - cutlass::Array result; - #pragma unroll - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], rhs[i]); - } - return result; + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); } __host__ __device__ - cutlass::Array - operator()(cutlass::Array const &lhs, T const &scalar) const { - cutlass::Array result; - #pragma unroll - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], scalar); - } - return result; + T add(T const lhs, T const rhs) const { + return lhs ^ rhs; } __host__ 
__device__ 
- cutlass::Array - operator()(T const &scalar, cutlass::Array const &rhs) const { - cutlass::Array result; - #pragma unroll - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(scalar, rhs[i]); - } - return result; + T mult(T const lhs, T const rhs) const { + return lhs && rhs; } }; + } // namespace -// clang-format on // GF(2) xor-and SRGEMM auto cuasr_gf_srgemm_nnn( @@ -118,10 +95,11 @@ auto cuasr_gf_srgemm_nnn( using OperatorClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; - using AdditionOp = binary_xor; - using MultiplicationOp = cuasr::binary_and; + // using AdditionOp = binary_xor; + // using MultiplicationOp = cuasr::binary_and; + using RingOp = xor_and; using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, int, 1>; + RingOp, int, 1>; static int constexpr AlignmentA = 1; static int constexpr AlignmentB = 1; @@ -135,8 +113,7 @@ auto cuasr_gf_srgemm_nnn( using RowMajor = cutlass::layout::RowMajor; using cuASRGaloisFieldSrgemm = cuasr::gemm::device::Srgemm< - AdditionOp, // Thread level SemiRing operator - MultiplicationOp, // Thread level SemiRing operator + RingOp, int, // element type of A RowMajor, // layout of A int, // element type of B @@ -157,8 +134,8 @@ auto cuasr_gf_srgemm_nnn( false // SplitKSerial >; - int alpha = MultiplicationOp::Identity; - int beta = do_epilogue_and ? MultiplicationOp::Identity : MultiplicationOp::Annihilator; + int alpha = RingOp::MultIdentity; + int beta = do_epilogue_and ? 
RingOp::MultIdentity : RingOp::MultAnnihilator; // construct kernel arguments struct cuASRGaloisFieldSrgemm::Arguments args( @@ -214,15 +191,13 @@ auto compare_host_reference( int ldc, int *reference_D, int *device_D) -> bool { - using AdditionOp = binary_xor; - using MultiplicationOp = cuasr::binary_and; + using RingOp = xor_and; using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, int, 1>; + RingOp, int, 1>; using RowMajor = cutlass::layout::RowMajor; cuasr::reference::host::Srgemm< - AdditionOp, // - MultiplicationOp, // + RingOp, // int, RowMajor, // int, RowMajor, // int, RowMajor, // @@ -235,24 +210,40 @@ auto compare_host_reference( { M, N, K }, // alpha, { A, lda }, { B, ldb }, // beta, { C, ldc }, { reference_D, ldc }, // - AdditionOp::Identity); + RingOp::AddIdentity); - auto is_correct = true; + auto is_wrong = false; + int counter = 0; for (int n = 0; n < N; ++n) { for (int m = 0; m < M; ++m) { - is_correct &= (reference_D[(ldc * n) + m] == device_D[(ldc * n) + m]); + auto incorrect = (reference_D[(ldc * n) + m] != device_D[(ldc * n) + m]); + is_wrong |= incorrect; + if (incorrect && counter < 10) { + std::cout << '[' << n << ',' << m << ']' + << " Expected = " << reference_D[(ldc * n) + m] + << " Computed = " << device_D[(ldc * n) + m] << '\n'; + counter++; + } } } - return is_correct; + return (not is_wrong); } - -int main() { +int main(int argc, char* argv[]) { using namespace std::chrono; // problem size - constexpr int M = 512; // 4096 - constexpr int N = 512; - constexpr int K = 512; + int M = 512; // optional CLI overrides: argv[1]=M, argv[2]=N, argv[3]=K + if (argc > 1) { + M = std::atoi(argv[1]); + } + int N = 512; + if (argc > 2) { + N = std::atoi(argv[2]); + } + int K = 512; + if (argc > 3) { + K = std::atoi(argv[3]); + } constexpr bool do_epilogue_and = true; std::cout << "Running Xor-And Galois Field SRGEMM on A = " << M << 'x' << K @@ -281,12 +272,10 @@ int main() { cudaMemcpy(d_B, B, 
sizeof(int) * K * N, cudaMemcpyHostToDevice); 
cudaMemcpy(d_C, C, sizeof(int) * M * N, cudaMemcpyHostToDevice); - auto start = high_resolution_clock::now(); - - auto retval - = cuasr_gf_srgemm_nnn(M, N, K, d_A, M, d_B, K, d_C, M, do_epilogue_and, nullptr); - retval |= cudaDeviceSynchronize(); - auto end = high_resolution_clock::now(); + auto start = high_resolution_clock::now(); + auto retval = cuasr_gf_srgemm_nnn(M, N, K, d_A, M, d_B, K, d_C, M, do_epilogue_and, nullptr); + retval |= cudaDeviceSynchronize(); + auto end = high_resolution_clock::now(); duration delta = (end - start); if (retval) { @@ -300,10 +289,10 @@ int main() { cudaMemcpy(device_D, d_C, sizeof(int) * M * N, cudaMemcpyDeviceToHost); // compare against host - std::cout << "Comparing against reference host-side SRGEMM : "; - int alpha = cuasr::binary_and::Identity; - int beta = do_epilogue_and ? cuasr::binary_and::Identity - : cuasr::binary_and::Annihilator; + std::cout << "Comparing against reference host-side SRGEMM :\n"; + int alpha = xor_and::MultIdentity; // same alpha the device kernel is configured with + int beta = do_epilogue_and ? xor_and::MultIdentity + : xor_and::MultAnnihilator; auto is_correct = compare_host_reference( M, N, K, alpha, A, M, B, N, beta, C, M, reference_D, device_D); diff --git a/examples/02_splitk_srgemm/splitk_srgemm.cu b/examples/02_splitk_srgemm/splitk_srgemm.cu index bba7d46..d749fcc 100644 --- a/examples/02_splitk_srgemm/splitk_srgemm.cu +++ b/examples/02_splitk_srgemm/splitk_srgemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. 
**************************************************************************************************/ #include @@ -29,21 +29,19 @@ auto cuasr_splitk_minplus_srsgemm_tn_t( int split_k_slices, cudaStream_t stream = nullptr) -> int { // compile time configuration of this srgemm kernel using OperatorClass - using OperatorClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - using AdditionOp = cuasr::minimum; - using MultiplicationOp = cuasr::plus; + using OperatorClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using TropicalConfig = typename cuasr::gemm::device::DefaultSemiRingConfiguration< float, float, float, float, OperatorClass, // - AdditionOp, MultiplicationOp, SmArch>; + RingOp, SmArch>; using ColumnMajor = cutlass::layout::ColumnMajor; using RowMajor = cutlass::layout::RowMajor; using cuASR_SplitK_SRGEMM = cuasr::gemm::device::SrgemmSplitKParallel< - AdditionOp, // Thread level SemiRing operator - MultiplicationOp, // Thread level SemiRing operator + RingOp, float, // element type of A RowMajor, // layout of A float, // element type of B @@ -54,9 +52,9 @@ auto cuasr_splitk_minplus_srsgemm_tn_t( >; // setup runtime configuration - float alpha = MultiplicationOp::Identity; + float alpha = RingOp::MultIdentity; float beta - = do_epilogue_min ? MultiplicationOp::Identity : MultiplicationOp::Annihilator; + = do_epilogue_min ? 
RingOp::MultIdentity : RingOp::MultAnnihilator; // construct kernel arguments struct cuASR_SplitK_SRGEMM::Arguments args( @@ -116,21 +114,19 @@ auto cuasr_minplus_srsgemm_tn_t( bool do_epilogue_min, cudaStream_t stream = nullptr) -> int { // compile time configuration of this srgemm kernel using OperatorClass - using OperatorClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - using AdditionOp = cuasr::minimum; - using MultiplicationOp = cuasr::plus; + using OperatorClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using TropicalConfig = typename cuasr::gemm::device::DefaultSemiRingConfiguration< float, float, float, float, OperatorClass, // - AdditionOp, MultiplicationOp, SmArch>; + RingOp, SmArch>; using ColumnMajor = cutlass::layout::ColumnMajor; using RowMajor = cutlass::layout::RowMajor; using cuASR_MinPlus_SGEMM = cuasr::gemm::device::Srgemm< - AdditionOp, // Thread level SemiRing operator - MultiplicationOp, // Thread level SemiRing operator + RingOp, float, // element type of A RowMajor, // layout of A float, // element type of B @@ -140,9 +136,9 @@ auto cuasr_minplus_srsgemm_tn_t( float // element type of D >; - float alpha = MultiplicationOp::Identity; + float alpha = RingOp::MultIdentity; float beta - = do_epilogue_min ? MultiplicationOp::Identity : MultiplicationOp::Annihilator; + = do_epilogue_min ? RingOp::MultIdentity : RingOp::MultAnnihilator; // construct kernel arguments struct cuASR_MinPlus_SGEMM::Arguments args( diff --git a/include/cuasr/arch/functional.h b/include/cuasr/arch/functional.h new file mode 100644 index 0000000..9250f17 --- /dev/null +++ b/include/cuasr/arch/functional.h @@ -0,0 +1,82 @@ +/*************************************************************************************************** + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+ **************************************************************************************************/ +#pragma once + +namespace cuasr { +namespace arch { + +/////////////////////////////////////////////////////////////////////////////// + +__host__ __device__ inline +int min(int lhs, int rhs) { +#if defined(__CUDA_ARCH__) + int ret; + asm("min.s32 %0, %1, %2;\n" : "=r"(ret) : "r"(lhs), "r"(rhs)); + return ret; +#else + return (lhs < rhs) ? lhs : rhs; +#endif +} + +__host__ __device__ inline +float min(float lhs, float rhs) { +#if defined(__CUDA_ARCH__) + float ret; + asm("min.f32 %0, %1, %2;\n" : "=f"(ret) : "f"(lhs), "f"(rhs)); + return ret; +#else + return (lhs < rhs) ? lhs : rhs; +#endif +} + +__host__ __device__ inline +double min(double lhs, double rhs) { +#if defined(__CUDA_ARCH__) + double ret; + asm("min.f64 %0, %1, %2;\n" : "=d"(ret) : "d"(lhs), "d"(rhs)); + return ret; +#else + return (lhs < rhs) ? lhs : rhs; +#endif +} + +/////////////////////////////////////////////////////////////////////////////// + +__host__ __device__ inline +int max(int lhs, int rhs) { +#if defined(__CUDA_ARCH__) + int ret; + asm("max.s32 %0, %1, %2;\n" : "=r"(ret) : "r"(lhs), "r"(rhs)); + return ret; +#else + return (lhs > rhs) ? lhs : rhs; +#endif +} + +__host__ __device__ inline +float max(float lhs, float rhs) { +#if defined(__CUDA_ARCH__) + float ret; + asm("max.f32 %0, %1, %2;\n" : "=f"(ret) : "f"(lhs), "f"(rhs)); + return ret; +#else + return (lhs > rhs) ? lhs : rhs; +#endif +} + +__host__ __device__ inline +double max(double lhs, double rhs) { +#if defined(__CUDA_ARCH__) + double ret; + asm("max.f64 %0, %1, %2;\n" : "=d"(ret) : "d"(lhs), "d"(rhs)); + return ret; +#else + return (lhs > rhs) ? 
lhs : rhs; +#endif +} + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cuasr diff --git a/include/cuasr/arch/srmma.h b/include/cuasr/arch/srmma.h index cf35181..ef1dd39 100644 --- a/include/cuasr/arch/srmma.h +++ b/include/cuasr/arch/srmma.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief Templates exposing architecture support for multiply-add operations @@ -34,10 +62,8 @@ template < typename ElementC, /// Layout of C matrix (concept: MatrixLayout) typename LayoutC, - /// addition operator of the semi-ring - typename AdditionOp, - /// multiplication operator of the semi-ring - typename MultiplicationOp + /// Ring operator that performs FMA + typename RingOp > struct Srmma; @@ -58,10 +84,8 @@ template < typename ElementC, /// Layout of C matrix (concept: MatrixLayout) typename LayoutC, - /// Addition operator of the semi-ring - typename AdditionOp, - /// Multiplication operator of the semi-ring - typename MultiplicationOp> + /// Ring operator that performs FMA + typename RingOp> struct Srmma< cutlass::gemm::GemmShape<1, 1, 1>, 1, @@ -71,14 +95,12 @@ struct Srmma< LayoutB, ElementC, LayoutC, - AdditionOp, - MultiplicationOp> { + RingOp> { using Shape = cutlass::gemm::GemmShape<1, 1, 1>; // semi-ring operators must be default contructible and // have a binary invocation () operator - AdditionOp add; - MultiplicationOp mult; + RingOp ring_op; CUTLASS_HOST_DEVICE void operator()( @@ -87,7 +109,7 @@ struct Srmma< cutlass::Array const &b, cutlass::Array const &c ) { - d[0] = add(c[0], mult(a[0], b[0])); + ring_op.fma(d[0], a[0], b[0], c[0]); } }; diff --git a/include/cuasr/functional.h 
b/include/cuasr/functional.h index b8a2c3e..f4663a6 100644 --- a/include/cuasr/functional.h +++ b/include/cuasr/functional.h @@ -1,15 +1,43 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief Defines basic semi-ring reels together with their identity and annihilator constants given type T. - - This is inspired by the Standard Library's header. */ #pragma once +#include "cuasr/arch/functional.h" + #include "cutlass/array.h" #include "cutlass/cutlass.h" @@ -34,252 +62,202 @@ template constexpr auto get_neginf() noexcept { return std::numeric_limits::min(); } -} -template -struct plus { - static T constexpr Identity = static_cast(0); - static T constexpr Annihilator = get_inf(); +} // namespace + +/////////////////////////////////////////////////////////////////////////////// - // scalar operator - CUTLASS_HOST_DEVICE - T operator()(T lhs, T const &rhs) const { - lhs += rhs; - return lhs; +// Regular FMA +template +struct plus_mult { + static T constexpr AddIdentity = static_cast(0); + static T constexpr MultIdentity = static_cast(1); + static T constexpr MultAnnihilator = static_cast(0); + + __host__ __device__ + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); } - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], rhs[i]); - } - return result; + __host__ __device__ + T add(T const lhs, T const rhs) const { + return lhs 
+ rhs; } - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, T const &scalar) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], scalar); - } - return result; + __host__ __device__ + T mult(T const lhs, T const rhs) const { + return lhs * rhs; } +}; - CUTLASS_HOST_DEVICE - Array operator()(T const &scalar, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(scalar, rhs[i]); - } - return result; +/////////////////////////////////////////////////////////////////////////////// + +template +struct min_plus { + static T constexpr AddIdentity = get_inf(); + static T constexpr MultIdentity = static_cast(0); + static T constexpr MultAnnihilator = get_inf(); + + __host__ __device__ + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); + } + + __host__ __device__ + T add(T const lhs, T const rhs) const { + return cuasr::arch::min(lhs, rhs); + } + + __host__ __device__ + T mult(T const lhs, T const rhs) const { // min-plus semiring product is ordinary addition + return lhs + rhs; } }; -template -struct multiplies { - static T constexpr Identity = static_cast(1); - static T constexpr Annihilator = static_cast(0); +/////////////////////////////////////////////////////////////////////////////// - // scalar operator - CUTLASS_HOST_DEVICE - T operator()(T lhs, T const &rhs) const { - lhs *= rhs; - return lhs; +template +struct max_plus { + static T constexpr AddIdentity = get_neginf(); + static T constexpr MultIdentity = static_cast(0); + static T constexpr MultAnnihilator = get_inf(); + + __host__ __device__ + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); } - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = 
this->operator()(lhs[i], rhs[i]); - } - return result; + 
__host__ __device__ + T add(T const lhs, T const rhs) const { + return cuasr::arch::max(lhs, rhs); } - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, T const &scalar) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], scalar); - } - return result; + __host__ __device__ + T mult(T const lhs, T const rhs) const { + return lhs + rhs; } +}; + +/////////////////////////////////////////////////////////////////////////////// - CUTLASS_HOST_DEVICE - Array operator()(T const &scalar, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(scalar, rhs[i]); - } - return result; +template +struct min_mult { + static T constexpr AddIdentity = get_inf(); + static T constexpr MultIdentity = static_cast(1); + static T constexpr MultAnnihilator = static_cast(0); + + __host__ __device__ + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); + } + + __host__ __device__ + T add(T const lhs, T const rhs) const { + return cuasr::arch::min(lhs, rhs); + } + + __host__ __device__ + T mult(T const lhs, T const rhs) const { + return lhs * rhs; } }; -template -struct minimum { - static T constexpr Identity = get_inf(); - static T constexpr Annihilator = get_neginf(); - - // scalar operator - CUTLASS_HOST_DEVICE - T operator()(T const &lhs, T const &rhs) const { return (rhs < lhs ? 
rhs : lhs); } - - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], rhs[i]); - } - return result; +/////////////////////////////////////////////////////////////////////////////// + +template +struct max_mult { + static T constexpr AddIdentity = get_neginf(); + static T constexpr MultIdentity = static_cast(1); + static T constexpr MultAnnihilator = static_cast(0); + + __host__ __device__ + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); } - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, T const &scalar) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], scalar); - } - return result; + __host__ __device__ + T add(T const lhs, T const rhs) const { + return cuasr::arch::max(lhs, rhs); } - CUTLASS_HOST_DEVICE - Array operator()(T const &scalar, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(scalar, rhs[i]); - } - return result; + __host__ __device__ + T mult(T const lhs, T const rhs) const { + return lhs * rhs; } }; -template -struct maximum { - static T constexpr Identity = get_neginf(); - static T constexpr Annihilator = get_inf(); - - // scalar operator - CUTLASS_HOST_DEVICE - T operator()(T const &lhs, T const &rhs) const { return (lhs < rhs ? 
rhs : lhs); } - - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], rhs[i]); - } - return result; +/////////////////////////////////////////////////////////////////////////////// + +template +struct min_max { + static T constexpr AddIdentity = get_inf(); + static T constexpr MultIdentity = get_neginf(); + static T constexpr MultAnnihilator = get_inf(); + + __host__ __device__ + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); } - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, T const &scalar) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], scalar); - } - return result; + __host__ __device__ + T add(T const lhs, T const rhs) const { + return cuasr::arch::min(lhs, rhs); } - CUTLASS_HOST_DEVICE - Array operator()(T const &scalar, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(scalar, rhs[i]); - } - return result; + __host__ __device__ + T mult(T const lhs, T const rhs) const { + return cuasr::arch::max(lhs, rhs); } }; -template -struct binary_and { - static T constexpr Identity = static_cast(true); - static T constexpr Annihilator = static_cast(false); - - // scalar operator - CUTLASS_HOST_DEVICE - T operator()(T lhs, T const &rhs) const { return lhs && rhs; } - - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], rhs[i]); - } - return result; +/////////////////////////////////////////////////////////////////////////////// + +template +struct max_min { + static T constexpr AddIdentity = get_neginf(); + static T constexpr MultIdentity = get_inf(); + static T constexpr 
MultAnnihilator = get_neginf(); + + __host__ __device__ + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); } - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, T const &scalar) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], scalar); - } - return result; + __host__ __device__ + T add(T const lhs, T const rhs) const { + return cuasr::arch::max(lhs, rhs); } - CUTLASS_HOST_DEVICE - Array operator()(T const &scalar, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(scalar, rhs[i]); - } - return result; + __host__ __device__ + T mult(T const lhs, T const rhs) const { + return cuasr::arch::min(lhs, rhs); } }; -template -struct binary_or { - static T constexpr Identity = static_cast(false); - static T constexpr Annihilator = static_cast(true); - - // scalar operator - CUTLASS_HOST_DEVICE - T operator()(T lhs, T const &rhs) const { return lhs || rhs; } - - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], rhs[i]); - } - return result; +/////////////////////////////////////////////////////////////////////////////// + +template +struct or_and { + static T constexpr AddIdentity = static_cast(0); + static T constexpr MultIdentity = static_cast(1); + static T constexpr MultAnnihilator = static_cast(0); + + __host__ __device__ + void fma(T& dst, T const lhs, T const rhs, T const src) const { + dst = add(src, mult(lhs, rhs)); } - CUTLASS_HOST_DEVICE - Array operator()(Array const &lhs, T const &scalar) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(lhs[i], scalar); - } - return result; + __host__ __device__ + T add(T const lhs, T const rhs) const { + return lhs || rhs; } - 
CUTLASS_HOST_DEVICE - Array operator()(T const &scalar, Array const &rhs) const { - Array result; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - result[i] = this->operator()(scalar, rhs[i]); - } - return result; + __host__ __device__ + T mult(T const lhs, T const rhs) const { + return lhs && rhs; } }; +/////////////////////////////////////////////////////////////////////////////// + } // namespace cuasr diff --git a/include/cuasr/gemm/device/default_srgemm_configuration.h b/include/cuasr/gemm/device/default_srgemm_configuration.h index 3c9bdc8..3ea8f5f 100644 --- a/include/cuasr/gemm/device/default_srgemm_configuration.h +++ b/include/cuasr/gemm/device/default_srgemm_configuration.h @@ -1,5 +1,32 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **************************************************************************************************/ /*! \file \brief Definitions for SRGEMM configuration structures. @@ -30,252 +57,66 @@ template < typename ElementB, typename ElementC, typename ElementAccumulator, + typename RingOp, typename OperatorClass, - typename AdditionOp, - typename MultiplicationOp, typename ArchTag > struct DefaultSemiRingConfiguration; +//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////// SM 50 ////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -// Plus-Times semi-ring GEMM configuration -// this is the traditional GEMM -template < - typename Element, - typename ArchTag -> -struct DefaultSemiRingConfiguration< - Element, - Element, - Element, - Element, - cutlass::arch::OpClassSimt, - cuasr::plus, - cuasr::multiplies, - ArchTag> { - - static int constexpr kAlignmentA = 1; - static int constexpr kAlignmentB = 1; - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - static int constexpr kStages = 2; - - using AdditionOp = cuasr::plus; - using MultiplicationOp = cuasr::multiplies; - - using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, Element, 1>; -}; - -// Min-Plus (tropical) semi-ring GEMM configuration -// example application: All Pairs Shorted Path -template < - typename Element, - typename ArchTag -> +template struct DefaultSemiRingConfiguration< Element, Element, Element, Element, + RingOp_, cutlass::arch::OpClassSimt, - cuasr::minimum, - cuasr::plus, ArchTag> { static int constexpr kAlignmentA = 1; static int constexpr kAlignmentB = 1; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - static int constexpr kStages = 2; - - using AdditionOp = cuasr::minimum; - using MultiplicationOp = cuasr::plus; - - using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, Element, 1>; -}; - -// Max-Plus semi-ring GEMM configuration -// example application: Viterbi algorithm -template < - typename Element, - typename ArchTag -> -struct DefaultSemiRingConfiguration< - Element, - Element, - Element, - Element, - cutlass::arch::OpClassSimt, - cuasr::maximum, - cuasr::plus, - ArchTag> { - - static int constexpr kAlignmentA = 1; - static int constexpr kAlignmentB = 1; - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - static int constexpr kStages = 2; - - using AdditionOp = cuasr::maximum; - using MultiplicationOp = cuasr::plus; - - using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, Element, 1>; -}; - -// Max-Min -template < - typename Element, - typename ArchTag -> -struct 
DefaultSemiRingConfiguration< - Element, - Element, - Element, - Element, - cutlass::arch::OpClassSimt, - cuasr::maximum, - cuasr::minimum, - ArchTag> { - - static int constexpr kAlignmentA = 1; - static int constexpr kAlignmentB = 1; - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - static int constexpr kStages = 2; - - using AdditionOp = cuasr::maximum; - using MultiplicationOp = cuasr::minimum; - - using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, Element, 1>; -}; - -// Min-Max -template < - typename Element, - typename ArchTag -> -struct DefaultSemiRingConfiguration< - Element, - Element, - Element, - Element, - cutlass::arch::OpClassSimt, - cuasr::minimum, - cuasr::maximum, - ArchTag> { - - static int constexpr kAlignmentA = 1; - static int constexpr kAlignmentB = 1; - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - static int constexpr kStages = 2; - - using AdditionOp = cuasr::minimum; - using MultiplicationOp = cuasr::maximum; - - using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, Element, 1>; -}; - -// Min-Times -template < - typename Element, - typename ArchTag -> -struct DefaultSemiRingConfiguration< - Element, - Element, - Element, - Element, - cutlass::arch::OpClassSimt, - cuasr::minimum, - cuasr::multiplies, - ArchTag> { - - static int constexpr kAlignmentA = 1; - static int constexpr kAlignmentB = 1; - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; static int constexpr kStages = 2; 
- using AdditionOp = cuasr::minimum; - using MultiplicationOp = cuasr::multiplies; + using RingOp = RingOp_; using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, Element, 1>; + RingOp, Element, 1>; }; -// Max-Times -template < - typename Element, - typename ArchTag -> -struct DefaultSemiRingConfiguration< - Element, - Element, - Element, - Element, - cutlass::arch::OpClassSimt, - cuasr::maximum, - cuasr::multiplies, - ArchTag> { - - static int constexpr kAlignmentA = 1; - static int constexpr kAlignmentB = 1; - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - static int constexpr kStages = 2; - - using AdditionOp = cuasr::maximum; - using MultiplicationOp = cuasr::multiplies; - - using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, Element, 1>; -}; +//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////// SM 80 ////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// -// Or-And boolean ring -template < - typename Element, - typename ArchTag -> +template struct DefaultSemiRingConfiguration< Element, Element, Element, Element, + RingOp_, cutlass::arch::OpClassSimt, - cuasr::binary_or, - cuasr::binary_and, - ArchTag> { + cutlass::arch::Sm80> { static int constexpr kAlignmentA = 1; static int constexpr kAlignmentB = 1; - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - static int constexpr kStages = 2; + static int constexpr kStages = 3; - using 
AdditionOp = cuasr::binary_or; - using MultiplicationOp = cuasr::binary_and; + using RingOp = RingOp_; using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination< - AdditionOp, MultiplicationOp, Element, 1>; + RingOp, Element, 1>; }; -//////////////////////////////////////////////////////////////////////////////// - } // namespace device } // namespace gemm } // namespace cuasr diff --git a/include/cuasr/gemm/device/srgemm.h b/include/cuasr/gemm/device/srgemm.h index 50482ff..bba64de 100644 --- a/include/cuasr/gemm/device/srgemm.h +++ b/include/cuasr/gemm/device/srgemm.h @@ -1,5 +1,32 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **************************************************************************************************/ /*! \file \brief Template for a pipelined Semiring GEMM kernel. Does not compute batching or support split-K. @@ -26,10 +53,8 @@ namespace device { //////////////////////////////////////////////////////////////////////////////// template < - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_, + /// Ring operation that performs FMA + typename RingOp_, /// Element type for A matrix operand typename ElementA_, /// Layout type for A matrix operand @@ -51,34 +76,34 @@ template < /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_ = typename DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::ThreadblockShape, + RingOp_, OperatorClass_, ArchTag_>::ThreadblockShape, /// Warp-level tile size (concept: GemmShape) typename WarpShape_ = typename DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::WarpShape, + RingOp_, OperatorClass_, ArchTag_>::WarpShape, /// Instruction-level tile size (concept: GemmShape) typename InstructionShape_ = typename DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, 
AdditionOp_, MultiplicationOp_, ArchTag_>::InstructionShape, + RingOp_, OperatorClass_, ArchTag_>::InstructionShape, /// Epilogue output operator typename EpilogueOutputOp_ = typename DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::EpilogueOutputOp, + RingOp_, OperatorClass_, ArchTag_>::EpilogueOutputOp, /// Threadblock-level swizzling operator typename ThreadblockSwizzle_ = typename cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::kStages, + RingOp_, OperatorClass_, ArchTag_>::kStages, /// Access granularity of A matrix in units of elements int AlignmentA = DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::kAlignmentA, + RingOp_, OperatorClass_, ArchTag_>::kAlignmentA, /// Access granularity of B matrix in units of elements int AlignmentB = DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::kAlignmentB, + RingOp_, OperatorClass_, ArchTag_>::kAlignmentB, /// If true, kernel supports split-K with serial reduction bool SplitKSerial = false > @@ -103,8 +128,7 @@ class Srgemm { using InstructionShape = InstructionShape_; using EpilogueOutputOp = EpilogueOutputOp_; using ThreadblockSwizzle = ThreadblockSwizzle_; - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + using RingOp = RingOp_; static int const kStages = Stages; static int const kAlignmentA = AlignmentA; static int const kAlignmentB = AlignmentB; @@ -113,6 +137,7 @@ class Srgemm { /// Define the kernel using SrgemmKernel = typename cuasr::gemm::kernel::DefaultSrgemm< + RingOp, 
ElementA, LayoutA, kAlignmentA, @@ -127,8 +152,6 @@ class Srgemm { ThreadblockShape, WarpShape, InstructionShape, - AdditionOp, - MultiplicationOp, EpilogueOutputOp, ThreadblockSwizzle, kStages, @@ -357,10 +380,8 @@ class Srgemm { }; template < - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_, + /// Ring operation that performs FMA + typename RingOp_, /// Element type for A matrix operand typename ElementA_, /// Layout type for A matrix operand @@ -395,12 +416,12 @@ template < int AlignmentB, /// If true, kernel supports split-K as a serial reduction bool SplitKSerial> -class Srgemm { public: @@ -422,16 +443,14 @@ class Srgemm::type, ElementA, diff --git a/include/cuasr/gemm/device/srgemm_splitk_parallel.h b/include/cuasr/gemm/device/srgemm_splitk_parallel.h index b88a522..6c24a58 100644 --- a/include/cuasr/gemm/device/srgemm_splitk_parallel.h +++ b/include/cuasr/gemm/device/srgemm_splitk_parallel.h @@ -1,5 +1,32 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **************************************************************************************************/ /*! \file \brief Template for GEMM performing a reduction over K partitions in parallel. @@ -34,10 +61,8 @@ namespace device { Gemm device-level operator performing parallel reduction over the K partition. 
*/ template < - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_, + /// Ring operation that performs FMA + typename RingOp_, /// Element type for A matrix operand typename ElementA_, /// Layout type for A matrix operand @@ -59,28 +84,28 @@ template < /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_ = typename DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::ThreadblockShape, + RingOp_, OperatorClass_, ArchTag_>::ThreadblockShape, /// Warp-level tile size (concept: GemmShape) typename WarpShape_ = typename DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::WarpShape, + RingOp_, OperatorClass_, ArchTag_>::WarpShape, /// Instruction-level tile size (concept: GemmShape) typename InstructionShape_ = typename DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::InstructionShape, + RingOp_, OperatorClass_, ArchTag_>::InstructionShape, /// Epilogue output operator typename EpilogueOutputOp_ = typename DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::EpilogueOutputOp, + RingOp_, OperatorClass_, ArchTag_>::EpilogueOutputOp, /// Epilogue conversion operator typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::EpilogueOutputOp::kCount, + RingOp_, OperatorClass_, ArchTag_>::EpilogueOutputOp::kCount, ElementAccumulator_>, /// Reduction operator typename ReductionOp_ = cuasr::reduction::thread::SemiringReduce< - 
AdditionOp_, ElementAccumulator_, + RingOp_, ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, /// Threadblock-level swizzling operator typename ThreadblockSwizzle_ = @@ -88,21 +113,20 @@ template < /// Number of stages used in the pipelined mainloop int Stages = DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::kStages, + RingOp_, OperatorClass_, ArchTag_>::kStages, /// Access granularity of A matrix in units of elements int kAlignmentA = DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::kAlignmentA, + RingOp_, OperatorClass_, ArchTag_>::kAlignmentA, /// Access granularity of B matrix in units of elements int kAlignmentB = DefaultSemiRingConfiguration< ElementA_, ElementB_, ElementC_, ElementAccumulator_, - OperatorClass_, AdditionOp_, MultiplicationOp_, ArchTag_>::kAlignmentB + RingOp_, OperatorClass_, ArchTag_>::kAlignmentB > class SrgemmSplitKParallel { public: - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + using RingOp = RingOp_; using ElementA = ElementA_; using LayoutA = LayoutA_; using ElementB = ElementB_; @@ -123,6 +147,7 @@ class SrgemmSplitKParallel { /// GEMM kernel using SrgemmKernel = typename cuasr::gemm::kernel::DefaultSrgemmSplitKParallel< + RingOp_, ElementA, LayoutA, kAlignmentA, @@ -137,8 +162,6 @@ class SrgemmSplitKParallel { ThreadblockShape, WarpShape, InstructionShape, - AdditionOp, - MultiplicationOp, ConvertScaledOp, ThreadblockSwizzle, kStages @@ -166,7 +189,6 @@ class SrgemmSplitKParallel { typename EpilogueOutputOp::Params epilogue; int split_k_slices; typename ConvertScaledOp::Params convert; - typename ReductionOp::Params reduction; // // Methods @@ -188,9 +210,7 @@ class SrgemmSplitKParallel { typename EpilogueOutputOp::Params(), int split_k_slices = 1, typename 
ConvertScaledOp::Params convert_ = - typename ConvertScaledOp::Params(), - typename ReductionOp::Params reduction_ = - typename ReductionOp::Params() + typename ConvertScaledOp::Params() ): problem_size(problem_size_), ref_A(ref_A_), @@ -199,8 +219,7 @@ class SrgemmSplitKParallel { ref_D(ref_D_), epilogue(epilogue_), split_k_slices(split_k_slices), - convert(convert_), - reduction(reduction_) { } + convert(convert_) { } }; private: @@ -381,10 +400,8 @@ class SrgemmSplitKParallel { /// Partial specialization for column-major output template < - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_, + /// Ring operation that performs FMA + typename RingOp_, /// Element type for A matrix operand typename ElementA_, /// Layout type for A matrix operand @@ -422,7 +439,7 @@ template < /// Access granularity of B matrix in units of elements int kAlignmentB> class SrgemmSplitKParallel< - AdditionOp_, MultiplicationOp_, ElementA_, LayoutA_, ElementB_, + RingOp_, ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, cutlass::layout::ColumnMajor, // partially specialized on LayoutC ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, @@ -430,8 +447,7 @@ class SrgemmSplitKParallel< > { public: - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + using RingOp = RingOp_; using ElementA = ElementA_; using LayoutA = LayoutA_; using ElementB = ElementB_; @@ -451,8 +467,7 @@ class SrgemmSplitKParallel< static int const kStages = Stages; using UnderlyingOperator = SrgemmSplitKParallel< - AdditionOp, - MultiplicationOp, + RingOp, ElementB, typename cutlass::layout::LayoutTranspose::type, ElementA, @@ -493,7 +508,6 @@ class SrgemmSplitKParallel< typename EpilogueOutputOp::Params epilogue; int split_k_slices; typename ConvertScaledOp::Params convert; - typename ReductionOp::Params reduction; // 
// Methods @@ -515,9 +529,7 @@ class SrgemmSplitKParallel< typename EpilogueOutputOp::Params(), int split_k_slices = 1, typename ConvertScaledOp::Params convert_ = - typename ConvertScaledOp::Params(), - typename ReductionOp::Params reduction_ = - typename ReductionOp::Params() + typename ConvertScaledOp::Params() ): problem_size(problem_size_), ref_A(ref_A_), @@ -526,8 +538,7 @@ class SrgemmSplitKParallel< ref_D(ref_D_), epilogue(epilogue_), split_k_slices(split_k_slices), - convert(convert_), - reduction(reduction_) { } + convert(convert_) { } }; private: diff --git a/include/cuasr/gemm/epilogue/thread/semiring_linear_combination.h b/include/cuasr/gemm/epilogue/thread/semiring_linear_combination.h index 04465a6..d52d614 100644 --- a/include/cuasr/gemm/epilogue/thread/semiring_linear_combination.h +++ b/include/cuasr/gemm/epilogue/thread/semiring_linear_combination.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief Functor performing linear combination operations used by epilogues. @@ -26,8 +54,7 @@ namespace thread { /// D = alpha * accumulator + beta * source + uniform /// template < - typename AdditionOp_, ///< Addition reel of this semi-ring - typename MultiplicationOp_, ///< Addition reel of this semi-ring + typename RingOp_, ///< Ring operator that exposes .add and .mult methods typename ElementOutput_, ///< Data type used to load and store tensors int Count, ///< Number of elements computed per operation typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type @@ -36,13 +63,12 @@ template < cutlass::FloatRoundStyle Round = cutlass::FloatRoundStyle::round_to_nearest> class SemiringLinearCombination { public: - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + using RingOp = RingOp_; using ElementOutput = ElementOutput_; using ElementAccumulator = ElementAccumulator_; using ElementCompute = ElementCompute_; - static int const kCount = Count; + static int const kCount = Count; using FragmentOutput = cutlass::Array; using 
FragmentAccumulator = cutlass::Array; @@ -65,8 +91,8 @@ class SemiringLinearCombination { CUTLASS_HOST_DEVICE Params() - : alpha(MultiplicationOp::Identity) - , beta(MultiplicationOp::Annihilator) + : alpha(RingOp::MultIdentity) + , beta(RingOp::MultAnnihilator) , alpha_ptr(nullptr) , beta_ptr(nullptr) { } @@ -80,21 +106,21 @@ class SemiringLinearCombination { CUTLASS_HOST_DEVICE Params(ElementCompute alpha) : alpha(alpha) - , beta(MultiplicationOp::Annihilator) + , beta(RingOp::MultAnnihilator) , alpha_ptr(nullptr) , beta_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr) - : alpha(MultiplicationOp::Identity) - , beta(MultiplicationOp::Annihilator) + : alpha(RingOp::MultIdentity) + , beta(RingOp::MultAnnihilator) , alpha_ptr(alpha_ptr) , beta_ptr(beta_ptr) { } CUTLASS_HOST_DEVICE Params(ElementCompute const *alpha_ptr) - : alpha(MultiplicationOp::Identity) - , beta(MultiplicationOp::Annihilator) + : alpha(RingOp::MultIdentity) + , beta(RingOp::MultAnnihilator) , alpha_ptr(alpha_ptr) , beta_ptr(nullptr) { } }; @@ -103,8 +129,7 @@ class SemiringLinearCombination { // scalars ElementCompute alpha_; ElementCompute beta_; - AdditionOp add_op_; - MultiplicationOp mult_op_; + RingOp ring_op_; public: /// Constructs the function object, possibly loading from pointers in host memory @@ -117,24 +142,24 @@ class SemiringLinearCombination { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { - ElementCompute kAdditiveIdentity = AdditionOp::Identity; - ElementCompute kMultiplicativeIdentity = MultiplicationOp::Identity; + ElementCompute kAdditiveIdentity = RingOp::AddIdentity; + ElementCompute kMultiplicativeIdentity = RingOp::MultIdentity; // no source needed if mult_op(beta, C[i,j]) is equal to add_op's identity - return (kAdditiveIdentity != mult_op_(beta_, kMultiplicativeIdentity)); + return (kAdditiveIdentity != ring_op_.mult(beta_, kMultiplicativeIdentity)); } /// Functionally 
required for serial reduction in the epilogue CUTLASS_HOST_DEVICE void set_k_partition(int k_partition) { if (k_partition) { - ElementCompute kMultiplicativeIdentity = MultiplicationOp::Identity; + ElementCompute kMultiplicativeIdentity = RingOp::MultIdentity; beta_ = kMultiplicativeIdentity; } } /// Computes semiring linear scale and translate - /// D = add_op_(mult_op_(alpha * accumulator), mult_op_(beta * source)) + /// D = ring_op_.add(ring_op_.mult(alpha * accumulator), ring_op_.mult(beta * source)) CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &accumulator, FragmentOutput const &source) const { @@ -149,10 +174,17 @@ class SemiringLinearCombination { // Perform binary operations // X = beta * C - ComputeFragment intermediate = mult_op_(beta_, converted_source); + ComputeFragment intermediate; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + intermediate[i] = ring_op_.mult(beta_, converted_source[i]); + } // D = (alpha * Accum) + X - intermediate = add_op_(mult_op_(alpha_, converted_accumulator), intermediate); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + intermediate[i] = ring_op_.add(ring_op_.mult(alpha_, converted_accumulator[i]), intermediate[i]); + } // Convert to destination numeric type cutlass::NumericArrayConverter @@ -161,7 +193,7 @@ class SemiringLinearCombination { return destination_converter(intermediate); } - /// Computes semiring linear scaling: D = mult_op_(alpha, accumulator) + /// Computes semiring linear scaling: D = ring_op_.mult(alpha, accumulator) CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &accumulator) const { // Convert source to internal compute numeric type @@ -173,7 +205,10 @@ class SemiringLinearCombination { // Perform binary operations ComputeFragment intermediate; - intermediate = mult_op_(alpha_, converted_accumulator); // D = alpha * Accum + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + intermediate[i] = ring_op_.mult(alpha_, 
converted_accumulator[i]); // D = alpha * Accum + } // Convert to destination numeric type cutlass::NumericArrayConverter diff --git a/include/cuasr/gemm/kernel/default_srgemm.h b/include/cuasr/gemm/kernel/default_srgemm.h index 561b653..50be26e 100644 --- a/include/cuasr/gemm/kernel/default_srgemm.h +++ b/include/cuasr/gemm/kernel/default_srgemm.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief @@ -39,6 +67,8 @@ namespace kernel { //////////////////////////////////////////////////////////////////////////////// template < + /// Ring operation that performs FMA + typename RingOp_, /// Element type for A matrix operand typename ElementA_, /// Layout type for A matrix operand @@ -67,10 +97,6 @@ template < typename WarpShape, /// Instruction-level tile size (concept: GemmShape) typename InstructionShape, - /// Addition operator of the semi-ring - typename AdditionOp, - /// Multiplication operator of the semi-ring - typename MultiplicationOp, /// Epilogue output operator typename EpilogueOutputOp, /// Threadblock-level swizzling operator @@ -79,10 +105,18 @@ template < int Stages, /// If true, kernel is configured to support serial reduction in the /// epilogue - bool SplitKSerial> + bool SplitKSerial, + /// Use zfill or predicate for out-of-bound cp.async + cutlass::gemm::SharedMemoryClearOption SharedMemoryClear + = cutlass::gemm::SharedMemoryClearOption::kNone> struct DefaultSrgemm; +//////////////////////////////////////////////////////////////////////////////// + +// SM50 SIMT Two Stage template < + /// Ring operation that performs FMA + typename RingOp, /// Element type for A matrix operand typename ElementA, /// Layout type for A matrix operand @@ -99,16 +133,10 @@ template < typename ElementC, /// Element 
type for internal accumulation typename ElementAccumulator, - /// Tag indicating architecture to tune for - typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape, /// Warp-level tile size (concept: GemmShape) typename WarpShape, - /// Addition operator of the semi-ring - typename AdditionOp, - /// Multiplication operator of the semi-ring - typename MultiplicationOp, /// Epilogue output operator typename EpilogueOutputOp, /// Threadblock-level swizzling operator @@ -117,6 +145,7 @@ template < bool SplitKSerial > struct DefaultSrgemm< + RingOp, ElementA, LayoutA, kAlignmentA, @@ -127,18 +156,17 @@ struct DefaultSrgemm< cutlass::layout::RowMajor, ElementAccumulator, cutlass::arch::OpClassSimt, - ArchTag, + cutlass::arch::Sm50, ThreadblockShape, WarpShape, cutlass::gemm::GemmShape<1, 1, 1>, - AdditionOp, - MultiplicationOp, EpilogueOutputOp, ThreadblockSwizzle, 2, SplitKSerial> { /// Define the threadblock-scoped matrix multiply-accumulate using Srmma = typename cuasr::gemm::threadblock::DefaultSrmma< + RingOp, ElementA, LayoutA, kAlignmentA, @@ -152,8 +180,6 @@ struct DefaultSrgemm< ThreadblockShape, WarpShape, cutlass::gemm::GemmShape<1, 1, 1>, - AdditionOp, - MultiplicationOp, 2>::ThreadblockSrmma; static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount; @@ -170,8 +196,101 @@ struct DefaultSrgemm< /// Define the kernel-level GEMM operator. 
using SrgemmKernel = cuasr::gemm::kernel::Srgemm< Srmma, - AdditionOp, - MultiplicationOp, + RingOp, + Epilogue, + ThreadblockSwizzle, + SplitKSerial + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +// SM80 SIMT Multi Stage +template < + /// Ring operation that performs FMA + typename RingOp, + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of A matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial +> +struct DefaultSrgemm< + RingOp, + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementC, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + cutlass::gemm::GemmShape<1, 1, 1>, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + SplitKSerial> { + /// Define the threadblock-scoped matrix multiply-accumulate + using Srmma = typename cuasr::gemm::threadblock::DefaultSrmma< + RingOp, + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementAccumulator, + cutlass::layout::RowMajor, + cutlass::arch::OpClassSimt, + 
cutlass::arch::Sm80, + ThreadblockShape, + WarpShape, + cutlass::gemm::GemmShape<1, 1, 1>, + Stages>::ThreadblockSrmma; + + static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount; + static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars"); + + /// Define the epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + typename Srmma::Operator, + EpilogueOutputOp, + kEpilogueElementsPerAccess + >::Epilogue; + + /// Define the kernel-level GEMM operator. + using SrgemmKernel = cuasr::gemm::kernel::Srgemm< + Srmma, + RingOp, Epilogue, ThreadblockSwizzle, SplitKSerial diff --git a/include/cuasr/gemm/kernel/default_srgemm_splitk_parallel.h b/include/cuasr/gemm/kernel/default_srgemm_splitk_parallel.h index 0d1650c..77d8ec8 100644 --- a/include/cuasr/gemm/kernel/default_srgemm_splitk_parallel.h +++ b/include/cuasr/gemm/kernel/default_srgemm_splitk_parallel.h @@ -1,7 +1,34 @@ /*************************************************************************************************** - + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ - /*! \file \brief Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with @@ -28,6 +55,8 @@ namespace kernel { //////////////////////////////////////////////////////////////////////////////// template < + /// Ring operation that performs FMA + typename RingOp_, /// Element type for A matrix operand typename ElementA_, /// Layout type for A matrix operand @@ -56,10 +85,6 @@ template < typename WarpShape, /// Warp-level tile size (concept: GemmShape) typename InstructionShape, - /// Addition operator of the semi-ring - typename AdditionOp, - /// Multiplication operator of the semi-ring - typename MultiplicationOp, /// Epilogue output operator typename EpilogueOutputOp, /// Threadblock-level swizzling operator @@ -68,10 +93,10 @@ template < int Stages > struct DefaultSrgemmSplitKParallel { - // Define threadblock-scoped split-K matrix multiply using // the basic SRGEMM's kernel level main loop using Default = DefaultSrgemm< + RingOp_, ElementA_, LayoutA_, kAlignmentA, @@ -86,8 +111,6 @@ struct DefaultSrgemmSplitKParallel { ThreadblockShape, 
WarpShape, InstructionShape, - AdditionOp, - MultiplicationOp, EpilogueOutputOp, ThreadblockSwizzle, Stages, @@ -100,11 +123,13 @@ struct DefaultSrgemmSplitKParallel { /// Define the epilogue using Epilogue = typename Default::Epilogue; + /// Ring operation that performs FMA + using RingOp = RingOp_; + /// Define the kernel-level GEMM operator. using SrgemmKernel = kernel::SrgemmSplitKParallel< Srmma, - AdditionOp, - MultiplicationOp, + RingOp, Epilogue, ThreadblockSwizzle >; diff --git a/include/cuasr/gemm/kernel/srgemm.h b/include/cuasr/gemm/kernel/srgemm.h index 816598d..ff91f7c 100644 --- a/include/cuasr/gemm/kernel/srgemm.h +++ b/include/cuasr/gemm/kernel/srgemm.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief Template for a pipelined Semiring GEMM kernel. Does not compute batching or support split-K. @@ -26,8 +54,7 @@ namespace kernel { // SemiRing Gemm kernel that support custom thread level MMA and init values. template < typename Srmma_, ///! Threadblock-scoped matrix multiply-accumulate - typename AdditionOp_, ///! Addition operator of the semi-ring - typename MultiplicationOp_, ///! Multiplication operator of the semi-ring + typename RingOp_, ///! Ring operation that performs FMA typename Epilogue_, ///! Epilogue typename ThreadblockSwizzle_, ///! Threadblock swizzling function bool SplitKSerial ///! If true, code supporting split-K via serial reduction is enabled. 
@@ -38,8 +65,7 @@ struct Srgemm { using Epilogue = Epilogue_; using OutputOp = typename Epilogue::OutputOp; using ThreadblockSwizzle = ThreadblockSwizzle_; - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + using RingOp = RingOp_; static bool const kSplitKSerial = SplitKSerial; /// Warp count (concept: GemmShape) @@ -155,7 +181,7 @@ struct Srgemm { /// Executes one GEMM CUTLASS_DEVICE void operator()(Params const ¶ms, SharedStorage &shared_storage) { - constexpr typename OutputOp::ElementCompute kAdditiveIdentity = AdditionOp::Identity; + constexpr typename OutputOp::ElementCompute kAdditiveIdentity = RingOp::AddIdentity; // Compute threadblock location ThreadblockSwizzle threadblock_swizzle; diff --git a/include/cuasr/gemm/kernel/srgemm_splitk_parallel.h b/include/cuasr/gemm/kernel/srgemm_splitk_parallel.h index a44a475..b517f9c 100644 --- a/include/cuasr/gemm/kernel/srgemm_splitk_parallel.h +++ b/include/cuasr/gemm/kernel/srgemm_splitk_parallel.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief Template for 3D SRGEMM performing a reduction over K partitions in parallel. @@ -22,16 +50,14 @@ namespace kernel { template < typename Srmma_, ///! Threadblock-scoped matrix multiply-accumulate - typename AdditionOp_, ///! Addition operator of the semi-ring - typename MultiplicationOp_, ///! Multiplication operator of the semi-ring + typename RingOp_, ///! Ring operation that performs FMA typename Epilogue_, ///! Epilogue typename ThreadblockSwizzle_ ///! 
Threadblock swizzling function > struct SrgemmSplitKParallel { using Srmma = Srmma_; - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + using RingOp = RingOp_; using Epilogue = Epilogue_; using OutputOp = typename Epilogue::OutputOp; using ThreadblockSwizzle = ThreadblockSwizzle_; @@ -106,7 +132,7 @@ struct SrgemmSplitKParallel { /// Executes one GEMM CUTLASS_DEVICE void operator()(Params const ¶ms, SharedStorage &shared_storage) { - constexpr typename OutputOp::ElementCompute kAdditiveIdentity = AdditionOp::Identity; + constexpr typename OutputOp::ElementCompute kAdditiveIdentity = RingOp::AddIdentity; // Compute threadblock location ThreadblockSwizzle threadblock_swizzle; diff --git a/include/cuasr/gemm/thread/srmma.h b/include/cuasr/gemm/thread/srmma.h index bcad6fb..f7d6971 100644 --- a/include/cuasr/gemm/thread/srmma.h +++ b/include/cuasr/gemm/thread/srmma.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief Templates exposing architecture support for warp-level multiply-add operations @@ -39,12 +67,8 @@ template < typename ElementC, /// Layout of C matrix (concept: MatrixLayout) typename LayoutC, - /// Addition operator of the semi-ring - typename AdditionOp, - /// Multiplication operator of the semi-ring - typename MultiplicationOp, - /// Used for partial specialization - typename Enable = bool + /// Ring operation that performs FMA + typename RingOp > struct Srmma; diff --git a/include/cuasr/gemm/thread/srmma_sm50.h b/include/cuasr/gemm/thread/srmma_sm50.h index 27ea407..fddd9b3 100644 --- a/include/cuasr/gemm/thread/srmma_sm50.h +++ b/include/cuasr/gemm/thread/srmma_sm50.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. 
+ * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! 
\file \brief Templates exposing architecture support for multiply-add operations @@ -24,7 +52,7 @@ namespace thread { /// Gemplate that handles all packed matrix layouts template < - /// Size of the Gemm problem - concept: cutlass::gemm::GemmShape<> + /// Size of the Gemm problem - concept: gemm::GemmShape<> typename Shape_, /// Data type of A elements typename ElementA_, @@ -38,14 +66,12 @@ template < typename ElementC_, /// Layout of C matrix (concept: layout::MapFunc) typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_ + /// Ring operation that performs FMA + typename RingOp_ > struct SrmmaGeneric { - /// Size of the Gemm problem - concept: cutlass::gemm::GemmShape<> + /// Size of the Gemm problem - concept: gemm::GemmShape<> using Shape = Shape_; /// Data type of operand A @@ -66,9 +92,8 @@ struct SrmmaGeneric { /// Layout of C matrix (concept: layout::MapFunc) using LayoutC = LayoutC_; - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Underlying semi-ring operator + using RingOp = RingOp_; /// A operand storage using FragmentA = cutlass::Array; @@ -80,19 +105,15 @@ struct SrmmaGeneric { using FragmentC = cutlass::Array; /// Instruction - using SrmmaOp = arch::Srmma< - cutlass::gemm::GemmShape<1,1,1>, - 1, - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - AdditionOp, MultiplicationOp>; + using SrmmaOp = RingOp; + + static bool const kMultipleOf2 = ((Shape::kM % 2 == 0) && (Shape::kN % 2 == 0)); // // Methods // - /// Computes a generalized matrix product on any semi-ring + /// Computes a matrix product D = A * B + C CUTLASS_HOST_DEVICE void operator()( FragmentC & D, @@ -107,7 +128,7 @@ struct SrmmaGeneric { reinterpret_cast(&B), LayoutB::packed({Shape::kK, Shape::kN})); cutlass::TensorRef d_ref( - reinterpret_cast(&D), LayoutC::packed({ Shape::kM, Shape::kN })); + 
reinterpret_cast(&D), LayoutC::packed(cutlass::make_Coord(Shape::kM, Shape::kN))); SrmmaOp srmma_op; @@ -117,42 +138,80 @@ struct SrmmaGeneric { // Compute matrix product CUTLASS_PRAGMA_UNROLL for (int k = 0; k < Shape::kK; ++k) { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 860) + if (kMultipleOf2 && cutlass::platform::is_same::value + && cutlass::platform::is_same::value + && cutlass::platform::is_same::value) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < Shape::kN; ++n) { - + //2x2 zigzag - m and n loops to increment by 2. Inner loop to process 4 multiply-adds in a 2x2 tile. CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < Shape::kM; ++m) { - - int m_serpentine = (n % 2) ? (Shape::kM - 1 - m) : m; - - cutlass::MatrixCoord mn(m_serpentine, n); - cutlass::MatrixCoord mk(m_serpentine, k); - cutlass::MatrixCoord kn(k, n); - - cutlass::Array d; - cutlass::Array a; - cutlass::Array b; + for (int n = 0; n < Shape::kN; n+=2) { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Shape::kM; m+=2) { + + int m_serpentine = (n % 4) ? 
(Shape::kM - 2 - m) : m; + + //top-left element in 2x2 tile + { + cutlass::MatrixCoord mn(m_serpentine, n); + cutlass::MatrixCoord mk(m_serpentine, k); + cutlass::MatrixCoord kn(k, n); + srmma_op.fma(d_ref.at(mn), a_ref.at(mk), b_ref.at(kn), d_ref.at(mn)); + } + + //bottom-left element in 2x2 tile + { + cutlass::MatrixCoord mn(m_serpentine+1, n); + cutlass::MatrixCoord mk(m_serpentine+1, k); + cutlass::MatrixCoord kn(k, n); + srmma_op.fma(d_ref.at(mn), a_ref.at(mk), b_ref.at(kn), d_ref.at(mn)); + } + + //bottom-right element in 2x2 tile + { + cutlass::MatrixCoord mn(m_serpentine+1, n+1); + cutlass::MatrixCoord mk(m_serpentine+1, k); + cutlass::MatrixCoord kn(k, n+1); + srmma_op.fma(d_ref.at(mn), a_ref.at(mk), b_ref.at(kn), d_ref.at(mn)); + } + + //top-right element in 2x2 tile + { + cutlass::MatrixCoord mn(m_serpentine, n+1); + cutlass::MatrixCoord mk(m_serpentine, k); + cutlass::MatrixCoord kn(k, n+1); + srmma_op.fma(d_ref.at(mn), a_ref.at(mk), b_ref.at(kn), d_ref.at(mn)); + } + } + } + } else + #endif + { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Shape::kN; ++n) { - d[0] = d_ref.at(mn); - a[0] = a_ref.at(mk); - b[0] = b_ref.at(kn); + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Shape::kM; ++m) { - srmma_op(d, a, b, d); + int m_serpentine = (n % 2) ? 
(Shape::kM - 1 - m) : m; - d_ref.at(mn) = d[0]; + cutlass::MatrixCoord mn(m_serpentine, n); + cutlass::MatrixCoord mk(m_serpentine, k); + cutlass::MatrixCoord kn(k, n); + srmma_op.fma(d_ref.at(mn), a_ref.at(mk), b_ref.at(kn), d_ref.at(mn)); + } } } } } }; - ///////////////////////////////////////////////////////////////////////////////////////////////// /// Gemplate that handles conventional layouts for FFMA and DFMA GEMM template < - /// Size of the Gemm problem - concept: cutlass::gemm::GemmShape<> + /// Size of the Gemm problem - concept: gemm::GemmShape<> typename Shape_, /// Data type of A elements typename ElementA_, @@ -166,25 +225,12 @@ template < typename ElementC_, /// Layout of C matrix (concept: layout::MapFunc) typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_ + /// Ring operation that performs FMA + typename RingOp_ > -struct Srmma< - Shape_, - ElementA_, - LayoutA_, - ElementB_, - LayoutB_, - ElementC_, - LayoutC_, - AdditionOp_, - MultiplicationOp_, - bool -> { - - /// Size of the Gemm problem - concept: cutlass::gemm::GemmShape<> +struct Srmma { + + /// Size of the Gemm problem - concept: gemm::GemmShape<> using Shape = Shape_; /// Data type of operand A @@ -205,9 +251,8 @@ struct Srmma< /// Layout of C matrix (concept: layout::MapFunc) using LayoutC = LayoutC_; - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Ring operation that performs FMA + using RingOp = RingOp_; /// A operand storage using FragmentA = Array; @@ -222,7 +267,7 @@ struct Srmma< // Methods // - /// Computes a matrix product for any semi-ring + /// Computes a matrix product D = A * B + C CUTLASS_HOST_DEVICE void operator()( FragmentC & D, @@ -238,8 +283,7 @@ struct Srmma< LayoutB, ElementC, LayoutC, - AdditionOp, - MultiplicationOp> srmma; + RingOp> srmma; srmma(D, A, B, C); } diff --git 
a/include/cuasr/gemm/threadblock/default_srmma.h b/include/cuasr/gemm/threadblock/default_srmma.h index 2435e65..8b59b4a 100644 --- a/include/cuasr/gemm/threadblock/default_srmma.h +++ b/include/cuasr/gemm/threadblock/default_srmma.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. @@ -27,6 +55,8 @@ namespace threadblock { //////////////////////////////////////////////////////////////////////////////// template < + /// Ring operation that performs FMA + typename RingOp, /// Element type for A matrix operand typename ElementA_, /// Layout type for A matrix operand @@ -53,10 +83,6 @@ template < typename WarpShape_, /// Instruction-level tile size (concept: GemmShape) typename InstructionShape_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_, /// Number of stages used in the pipelined mainloop int Stages, /// Store the accumulators in row major or column major. 
@@ -69,6 +95,8 @@ struct DefaultSrmma; /// Specialization for row-major output (OperatorClass Simt) template < + /// Ring operation that performs FMA + typename RingOp, /// Element type for A matrix operand typename ElementA, /// Layout type for A matrix operand @@ -83,27 +111,21 @@ template < int kAlignmentB, /// Element type for internal accumulation typename ElementAccumulator, - /// Tag indicating architecture to tune for - typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape, /// Warp-level tile size (concept: GemmShape) typename WarpShape, /// Instruction-level tile size (concept: GemmShape) - typename InstructionShape, - /// Addition operator of the semi-ring - typename AdditionOp, - /// Multiplication operator of the semi-ring - typename MultiplicationOp> -struct DefaultSrmma +struct DefaultSrmma { + cutlass::arch::OpClassSimt, cutlass::arch::Sm50, ThreadblockShape, WarpShape, + InstructionShape, 2, false> { // Define the SrmmaCore components using SrmmaCore = typename cuasr::gemm::threadblock::DefaultSrmmaCore< ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator, cutlass::layout::RowMajor, - cutlass::arch::OpClassSimt, AdditionOp, MultiplicationOp, 2>; + cutlass::arch::OpClassSimt, RingOp, 2>; // Define iterators over tiles from the A operand using IteratorA = @@ -119,11 +141,90 @@ struct DefaultSrmma; + typename SrmmaCore::Shape, + IteratorA, typename SrmmaCore::SmemIteratorA, + IteratorB, typename SrmmaCore::SmemIteratorB, + ElementAccumulator, cutlass::layout::RowMajor, + typename SrmmaCore::MmaPolicy>; }; +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output multi-stage (OperatorClass Simt) +template < + /// Ring operation that performs FMA + typename RingOp, + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// 
Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages> +struct DefaultSrmma { + + static cutlass::arch::CacheOperation::Kind const CacheOpA = + ((cutlass::sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((cutlass::sizeof_bits::value * kAlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the SrmmaCore components + using SrmmaCore = typename cuasr::gemm::threadblock::DefaultSrmmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, cutlass::layout::RowMajor, + cutlass::arch::OpClassSimt, RingOp, Stages, CacheOpA, CacheOpB>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename SrmmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename SrmmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, 
ThreadMapB, AccessTypeB>; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockSrmma = cuasr::gemm::threadblock::SrmmaMultistage< + typename SrmmaCore::Shape, + IteratorA, typename SrmmaCore::SmemIteratorA, SrmmaCore::kCacheOpA, + IteratorB, typename SrmmaCore::SmemIteratorB, SrmmaCore::kCacheOpB, + ElementAccumulator, cutlass::layout::RowMajor, + typename SrmmaCore::MmaPolicy, Stages>; +}; + +//////////////////////////////////////////////////////////////////////////////// } // namespace threadblock } // namespace gemm diff --git a/include/cuasr/gemm/threadblock/default_srmma_core.h b/include/cuasr/gemm/threadblock/default_srmma_core.h index 3f69212..2c31ea6 100644 --- a/include/cuasr/gemm/threadblock/default_srmma_core.h +++ b/include/cuasr/gemm/threadblock/default_srmma_core.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data @@ -20,6 +48,7 @@ #include "cutlass/gemm/warp/mma.h" #include "cuasr/gemm/threadblock/srmma_pipelined.h" +#include "cuasr/gemm/threadblock/srmma_multistage.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -52,10 +81,8 @@ template < typename LayoutC, /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) typename OperatorClass, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_, + /// Ring operation that performs FMA + typename RingOp, /// Number of stages int Stages = 2, /// Store the accumulators in row major or column major. 
@@ -82,3 +109,4 @@ struct DefaultSrmmaCore; } // namespace cuasr #include "cuasr/gemm/threadblock/default_srmma_core_simt.h" +#include "cuasr/gemm/threadblock/default_srmma_core_sm80.h" diff --git a/include/cuasr/gemm/threadblock/default_srmma_core_simt.h b/include/cuasr/gemm/threadblock/default_srmma_core_simt.h index 211f05c..f4602c7 100644 --- a/include/cuasr/gemm/threadblock/default_srmma_core_simt.h +++ b/include/cuasr/gemm/threadblock/default_srmma_core_simt.h @@ -1,5 +1,33 @@ /*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * **************************************************************************************************/ /*! \file \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data @@ -74,14 +102,12 @@ template < typename ElementC_, /// Layout of accumulator typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_> + /// Ring operation that performs FMA + typename RingOp_> struct DefaultSrmmaCore, ElementA_, cutlass::layout::ColumnMajor, ElementB_, cutlass::layout::RowMajor, ElementC_, LayoutC_, cutlass::arch::OpClassSimt, - AdditionOp_, MultiplicationOp_, 2 + RingOp_, 2 > { using Shape = Shape_; using WarpShape = WarpShape_; @@ -95,9 +121,8 @@ struct DefaultSrmmaCore, E using OperatorClass = cutlass::arch::OpClassSimt; static int const PartitionsK = Shape::kK / WarpShape::kK; - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Underlying semi-ring operator + using RingOp = RingOp_; /// Number of warps present using WarpCount = cutlass::gemm::GemmShape< @@ -200,8 +225,7 @@ struct DefaultSrmmaCore, E ElementC, /// Element type of C matrix LayoutC, /// Layout of C matrix (concept: MatrixLayout) Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) - AdditionOp, /// Addition operator of the semi-ring - MultiplicationOp 
/// Multiplication operator of the semi-ring + RingOp >; /// Policy used to define MmaPipelined @@ -236,14 +260,12 @@ template < typename ElementC_, /// Layout of accumulator typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_> + /// Ring operation that performs FMA + typename RingOp_> struct DefaultSrmmaCore, ElementA_, cutlass::layout::RowMajor, ElementB_, cutlass::layout::ColumnMajor, ElementC_, LayoutC_, cutlass::arch::OpClassSimt, - AdditionOp_, MultiplicationOp_, 2 + RingOp_, 2 > { using Shape = Shape_; using WarpShape = WarpShape_; @@ -257,9 +279,8 @@ struct DefaultSrmmaCore, E using OperatorClass = cutlass::arch::OpClassSimt; static int const PartitionsK = Shape::kK / WarpShape::kK; - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Underlying semi-ring operator + using RingOp = RingOp_; /// Number of warps present using WarpCount = cutlass::gemm::GemmShape< @@ -372,8 +393,7 @@ struct DefaultSrmmaCore, E ElementC, /// Element type of C matrix LayoutC, /// Layout of C matrix (concept: MatrixLayout) Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) - AdditionOp, /// Addition operator of the semi-ring - MultiplicationOp /// Multiplication operator of the semi-ring + RingOp >; @@ -409,13 +429,11 @@ template < typename ElementC_, /// Layout of accumulator typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_> + /// Ring operation that performs FMA + typename RingOp_> struct DefaultSrmmaCore, ElementA_, cutlass::layout::RowMajor, ElementB_, cutlass::layout::RowMajor, ElementC_, - LayoutC_, cutlass::arch::OpClassSimt, AdditionOp_, MultiplicationOp_, 2 + LayoutC_, cutlass::arch::OpClassSimt, RingOp_, 2 > { using Shape = Shape_; using WarpShape = 
WarpShape_; @@ -429,9 +447,8 @@ struct DefaultSrmmaCore, E using OperatorClass = cutlass::arch::OpClassSimt; static int const PartitionsK = Shape::kK / WarpShape::kK; - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Underlying semi-ring operator + using RingOp = RingOp_; /// Number of warps present using WarpCount = cutlass::gemm::GemmShape< @@ -540,8 +557,7 @@ struct DefaultSrmmaCore, E ElementC, /// Element type of C matrix LayoutC, /// Layout of C matrix (concept: MatrixLayout) Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) - AdditionOp, /// Addition operator of the semi-ring - MultiplicationOp /// Multiplication operator of the semi-ring + RingOp >; /// Policy used to define MmaPipelined @@ -576,14 +592,12 @@ template < typename ElementC_, /// Layout of accumulator typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_> + /// Ring operation that performs FMA + typename RingOp_> struct DefaultSrmmaCore, ElementA_, cutlass::layout::ColumnMajor, ElementB_, cutlass::layout::ColumnMajor, ElementC_, LayoutC_, cutlass::arch::OpClassSimt, - AdditionOp_, MultiplicationOp_, 2 + RingOp_, 2 > { using Shape = Shape_; using WarpShape = WarpShape_; @@ -597,9 +611,8 @@ struct DefaultSrmmaCore, E using OperatorClass = cutlass::arch::OpClassSimt; static int const PartitionsK = Shape::kK / WarpShape::kK; - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Underlying semi-ring operator + using RingOp = RingOp_; /// Number of warps present using WarpCount = cutlass::gemm::GemmShape< @@ -708,8 +721,7 @@ struct DefaultSrmmaCore, E ElementC, /// Element type of C matrix LayoutC, /// Layout of C matrix (concept: MatrixLayout) Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) - 
AdditionOp, /// Addition operator of the semi-ring - MultiplicationOp /// Multiplication operator of the semi-ring + RingOp >; /// Policy used to define MmaPipelined @@ -740,13 +752,11 @@ template < typename ElementC_, /// Layout of accumulator typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_> + /// Ring operation that performs FMA + typename RingOp_> struct DefaultSrmmaCore, int8_t, cutlass::layout::ColumnMajor, int8_t, cutlass::layout::RowMajor, ElementC_, - LayoutC_, cutlass::arch::OpClassSimt, AdditionOp_, MultiplicationOp_, 2 + LayoutC_, cutlass::arch::OpClassSimt, RingOp_, 2 > { using Shape = Shape_; @@ -761,9 +771,8 @@ struct DefaultSrmmaCore, i using OperatorClass = cutlass::arch::OpClassSimt; static int const PartitionsK = Shape::kK / WarpShape::kK; - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Underlying semi-ring operator + using RingOp = RingOp_; /// Number of warps present using WarpCount = cutlass::gemm::GemmShape< @@ -866,8 +875,7 @@ struct DefaultSrmmaCore, i ElementC, /// Element type of C matrix LayoutC, /// Layout of C matrix (concept: MatrixLayout) Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) - AdditionOp, /// Addition operator of the semi-ring - MultiplicationOp, /// Multiplication operator of the semi-ring + RingOp, PartitionsK /// Number of partitions along K dimension >; @@ -899,13 +907,11 @@ template < typename ElementC_, /// Layout of accumulator typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_> + /// Ring operation that performs FMA + typename RingOp_> struct DefaultSrmmaCore, int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, ElementC_, - LayoutC_, cutlass::arch::OpClassSimt, AdditionOp_, 
MultiplicationOp_, 2 + LayoutC_, cutlass::arch::OpClassSimt, RingOp_, 2 > { using Shape = Shape_; @@ -920,10 +926,8 @@ struct DefaultSrmmaCore, i using OperatorClass = cutlass::arch::OpClassSimt; static int const PartitionsK = Shape::kK / WarpShape::kK; - - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Underlying semi-ring operator + using RingOp = RingOp_; /// Number of warps present using WarpCount = cutlass::gemm::GemmShape< @@ -1032,8 +1036,7 @@ struct DefaultSrmmaCore, i ElementC, /// Element type of C matrix LayoutC, /// Layout of C matrix (concept: MatrixLayout) Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) - AdditionOp, /// Addition operator of the semi-ring - MultiplicationOp, /// Multiplication operator of the semi-ring + RingOp, PartitionsK /// Number of partitions along K dimension >; @@ -1068,13 +1071,11 @@ template < typename ElementC_, /// Layout of accumulator typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_> + /// Ring operation that performs FMA + typename RingOp_> struct DefaultSrmmaCore, int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::RowMajor, ElementC_, - LayoutC_, cutlass::arch::OpClassSimt, AdditionOp_, MultiplicationOp_, 2 + LayoutC_, cutlass::arch::OpClassSimt, RingOp_, 2 > { using Shape = Shape_; @@ -1089,10 +1090,8 @@ struct DefaultSrmmaCore, i using OperatorClass = cutlass::arch::OpClassSimt; static int const PartitionsK = Shape::kK / WarpShape::kK; - - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Underlying semi-ring operator + using RingOp = RingOp_; /// Number of warps present using WarpCount = cutlass::gemm::GemmShape< @@ -1197,8 +1196,7 @@ struct DefaultSrmmaCore, i ElementC, /// Element type of C matrix LayoutC, /// Layout of C 
matrix (concept: MatrixLayout) Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) - AdditionOp, /// Addition operator of the semi-ring - MultiplicationOp, /// Multiplication operator of the semi-ring + RingOp, PartitionsK /// Number of partitions along K dimension >; @@ -1233,13 +1231,11 @@ template < typename ElementC_, /// Layout of accumulator typename LayoutC_, - /// Addition operator of the semi-ring - typename AdditionOp_, - /// Multiplication operator of the semi-ring - typename MultiplicationOp_> + /// Ring operation that performs FMA + typename RingOp_> struct DefaultSrmmaCore, int8_t, cutlass::layout::ColumnMajor, int8_t, cutlass::layout::ColumnMajor, ElementC_, - LayoutC_, cutlass::arch::OpClassSimt, AdditionOp_, MultiplicationOp_, 2 + LayoutC_, cutlass::arch::OpClassSimt, RingOp_, 2 > { using Shape = Shape_; @@ -1254,9 +1250,8 @@ struct DefaultSrmmaCore, i using OperatorClass = cutlass::arch::OpClassSimt; static int const PartitionsK = Shape::kK / WarpShape::kK; - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Underlying semi-ring operator + using RingOp = RingOp_; /// Number of warps present using WarpCount = cutlass::gemm::GemmShape< @@ -1305,7 +1300,6 @@ struct DefaultSrmmaCore, i IteratorThreadMapA >; - /// Policy of iterator B using IteratorThreadMapB = cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap< cutlass::layout::PitchLinearShape, @@ -1353,7 +1347,6 @@ struct DefaultSrmmaCore, i LaneMmaShape >; - using MmaWarpSimt = cuasr::gemm::warp::SrmmaSimt< WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 ElementA, /// Data type of A elements @@ -1363,8 +1356,7 @@ struct DefaultSrmmaCore, i ElementC, /// Element type of C matrix LayoutC, /// Layout of C matrix (concept: MatrixLayout) Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) - AdditionOp, /// Addition operator of the 
semi-ring - MultiplicationOp, /// Multiplication operator of the semi-ring + RingOp, PartitionsK /// Number of partitions along K dimension >; diff --git a/include/cuasr/gemm/threadblock/default_srmma_core_sm80.h b/include/cuasr/gemm/threadblock/default_srmma_core_sm80.h new file mode 100644 index 0000000..158b6ed --- /dev/null +++ b/include/cuasr/gemm/threadblock/default_srmma_core_sm80.h @@ -0,0 +1,711 @@ +/*************************************************************************************************** + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Defines basic properties needed by CTA-level GEMMs assuming + expectations about data layout of the global memory fragments, data types, + and internal tile sizes. + + Partial specializations for threadblock::Mma operations targeting SIMT + instructions. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt.h" + + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h" + +#include "cuasr/gemm/threadblock/default_srmma_core.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cuasr { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for SIMT GEMMs using multistage pipeline. 
+/// A: column-major (LayoutA) +/// B: column-major (LayoutB) +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Ring operation that performs FMA + typename RingOp_, + /// Number of stages + int Stages, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultSrmmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = cutlass::layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using RingOp = RingOp_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = cutlass::gemm::GemmShape; + + // Divisibility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = cutlass::gemm::warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; +
+ // + // Shared memory layouts + // + + using SmemLayoutA = cutlass::layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = cutlass::layout::RowMajor; + + // + // Iterators to write to shared memory + // + + + /// ThreadMap of iterator A + using IteratorThreadMapA = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = cutlass::transform::threadblock::RegularTileAccessIterator< + cutlass::MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator B + using SmemThreadMapB = cutlass::transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = cutlass::transform::threadblock::RegularTileAccessIterator< + cutlass::MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ?
2 : 1; + static const int numElementsA = 128 / cutlass::sizeof_bits::value; + static const int numElementsB = 128 / cutlass::sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + + static_assert(!((Shape::kK / 32) % LaneN), + "Padding must be divisible by Lane"); + + // Per-lane MMA shape; LaneM and LaneN are already capped by the thread tile via const_min + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cuasr::gemm::warp::SrmmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) + RingOp /// Ring operation that performs FMA + >; + + /// Policy used to define MmaPipelined + using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy< + MmaWarpSimt, + cutlass::MatrixShape<0, 0>, + cutlass::MatrixShape<0, Shape::kK / 32>, + WarpCount::kK + >; +}; + +/// Partial specialization for SIMT GEMMs using multistage pipeline.
+/// A: column-major (LayoutA) +/// B: row-major (LayoutB) +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Ring operation that performs FMA + typename RingOp_, + /// Number of stages + int Stages, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultSrmmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = cutlass::layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = cutlass::layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using RingOp = RingOp_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = cutlass::gemm::GemmShape; + + // Divisibility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = cutlass::gemm::warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + +
// + // Shared memory layouts + // + + using SmemLayoutA = cutlass::layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = cutlass::layout::RowMajor; + + // + // Iterators to write to shared memory + // + + + /// ThreadMap of iterator A + using IteratorThreadMapA = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = cutlass::transform::threadblock::RegularTileAccessIterator< + cutlass::MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = cutlass::transform::threadblock::RegularTileAccessIterator< + cutlass::MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ?
2 : 1; + static const int numElementsA = 128 / cutlass::sizeof_bits::value; + static const int numElementsB = 128 / cutlass::sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + + // Per-lane MMA shape; LaneM and LaneN are already capped by the thread tile via const_min + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cuasr::gemm::warp::SrmmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) + RingOp /// Ring operation that performs FMA + >; + + /// Policy used to define MmaPipelined + using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy< + MmaWarpSimt, + cutlass::MatrixShape<0, 0>, + cutlass::MatrixShape<0, 0>, + WarpCount::kK + >; +}; + +/// Partial specialization for SIMT GEMMs using multistage pipeline.
+/// A: row-major (LayoutA) +/// B: column-major (LayoutB) +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Ring operation that performs FMA + typename RingOp_, + /// Number of stages + int Stages, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultSrmmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using RingOp = RingOp_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = cutlass::gemm::GemmShape; + + // Divisibility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = cutlass::gemm::warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + +
// + // Shared memory layouts + // + + using SmemLayoutA = cutlass::layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = cutlass::layout::RowMajor; + + // + // Iterators to write to shared memory + // + + + /// ThreadMap of iterator A + using IteratorThreadMapA = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = cutlass::transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = cutlass::transform::threadblock::RegularTileAccessIterator< + cutlass::MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator B + using SmemThreadMapB = cutlass::transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = cutlass::transform::threadblock::RegularTileAccessIterator< + cutlass::MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ?
2 : 1; + static const int numElementsA = 128 / cutlass::sizeof_bits::value; + static const int numElementsB = 128 / cutlass::sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + + static_assert(!((Shape::kK / 32) % LaneM) && !((Shape::kK / 32) % LaneN), + "Padding must be divisible by Lane"); + + // Per-lane MMA shape; LaneM and LaneN are already capped by the thread tile via const_min + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cuasr::gemm::warp::SrmmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) + RingOp /// Ring operation that performs FMA + >; + + /// Policy used to define MmaPipelined + using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy< + MmaWarpSimt, + cutlass::MatrixShape, + cutlass::MatrixShape<0, Shape::kK / 32>, + WarpCount::kK + >; +}; + +/// Partial specialization for SIMT GEMMs using multistage pipeline.
+/// A: row-major (LayoutA) +/// B: row-major (LayoutB) +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Ring operation that performs FMA + typename RingOp_, + /// Number of stages + int Stages, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultSrmmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = cutlass::layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using RingOp = RingOp_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = cutlass::gemm::GemmShape; + + // Divisibility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = cutlass::gemm::warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + + // +
// Shared memory layouts + // + + using SmemLayoutA = cutlass::layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = cutlass::layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = cutlass::transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = cutlass::transform::threadblock::RegularTileAccessIterator< + cutlass::MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = cutlass::transform::threadblock::RegularTileAccessIterator< + cutlass::MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ?
2 : 1; + static const int numElementsA = 128 / cutlass::sizeof_bits::value; + static const int numElementsB = 128 / cutlass::sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + + static_assert(!((Shape::kK / 32) % LaneM), + "Padding must be divisible by Lane"); + + // Per-lane MMA shape; LaneM and LaneN are already capped by the thread tile via const_min + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cuasr::gemm::warp::SrmmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy, /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy) + RingOp /// Ring operation that performs FMA + >; + + /// Policy used to define MmaPipelined + using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy< + MmaWarpSimt, + cutlass::MatrixShape, + cutlass::MatrixShape<0, 0>, + WarpCount::kK + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cuasr diff --git a/include/cuasr/gemm/threadblock/srmma_multistage.h b/include/cuasr/gemm/threadblock/srmma_multistage.h new file mode 100644 index 0000000..5bf460f --- /dev/null +++ b/include/cuasr/gemm/threadblock/srmma_multistage.h @@ -0,0 +1,583 @@ +/*************************************************************************************************** + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage pipelined threadblock-scoped GEMM kernel.
+*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cuasr { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Use zfill or predicate for out-of-bound cp.async + cutlass::gemm::SharedMemoryClearOption SharedMemoryClear = cutlass::gemm::SharedMemoryClearOption::kNone, + /// Used for partial specialization + 
typename Enable = bool> +class SrmmaMultistage : + public cutlass::gemm::threadblock::MmaBase { +public: + ///< Base class + using Base = cutlass::gemm::threadblock::MmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = cutlass::arch::Sm80; + + /// Complex transform on A operand + static cutlass::ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static cutlass::ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. 
+ struct Detail { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Additive operator's identity value, used to initialize registers and smem + ElementC additive_identity_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + SrmmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx, + ///< Identity value of multiply op + ElementC additive_identity + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + 
smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx), + additive_identity_(additive_identity) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B, + int group_start_A = 0, int group_start_B = 0) { + iterator_A.set_iteration_index(group_start_A * + IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = cutlass::sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + if (SharedMemoryClear == cutlass::gemm::SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, 
iterator_A.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B * + IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = cutlass::sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + if (SharedMemoryClear == cutlass::gemm::SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const &src_accum) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for 
(int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + cutlass::sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = + cutlass::sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels + // so that all accumulator elements outside the GEMM footprint are zero. 
+ // + + if (SharedMemoryClear == cutlass::gemm::SharedMemoryClearOption::kClearLastStage) { + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_); + typename IteratorB::AccessType zero_B; + + zero_B.clear(); + last_smem_iterator_B.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + last_smem_iterator_B.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B; + } + } + + // Waits until kStages-2 stages have committed. 
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_frag_A[2]; + WarpLoadedFragmentB warp_frag_B[2]; + warp_frag_A[0].fill(additive_identity_); + warp_frag_B[0].fill(additive_identity_); + warp_frag_A[1].fill(additive_identity_); + warp_frag_B[1].fill(additive_identity_); + + Operator warp_mma; + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+ + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + warp_mma( + accum, + warp_frag_A[warp_mma_k % 2], + warp_frag_B[warp_mma_k % 2], + accum + ); + + // Issue global->shared copies for this stage + if (warp_mma_k < Base::kWarpGemmIterations - 1) { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, + group_start_iteration_B); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, + group_start_iteration_B); + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. 
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + } + } + } + + if (SharedMemoryClear == cutlass::gemm::SharedMemoryClearOption::kZfill) { + // commit and drain all pending and predicated LDGSTS pnz from the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cuasr + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cuasr/gemm/threadblock/srmma_pipelined.h b/include/cuasr/gemm/threadblock/srmma_pipelined.h index 49ca9c1..f3d59ce 100644 --- a/include/cuasr/gemm/threadblock/srmma_pipelined.h +++ b/include/cuasr/gemm/threadblock/srmma_pipelined.h @@ -1,5 +1,33 @@ 
/*************************************************************************************************** - * Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * **************************************************************************************************/ /*! \file \brief Template for a double-buffered threadblock-scoped GEMM kernel. @@ -53,8 +81,7 @@ template < typename SmemIteratorA_::Element, typename IteratorA_::Element, IteratorA_::Fragment::kElements>, - /// - /// Transformation applied to A operand + /// Transformation applied to B operand typename TransformB_ = cutlass::NumericArrayConverter< typename SmemIteratorB_::Element, typename IteratorB_::Element, @@ -124,7 +151,7 @@ class SrmmaPipelined : public cutlass::gemm::threadblock::MmaBase class SrmmaSimt { public: @@ -82,9 +106,8 @@ class SrmmaSimt { /// Indicates class of matrix operator using OperatorClass = cutlass::arch::OpClassSimt; - /// Underlying semi-ring operators - using AdditionOp = AdditionOp_; - using MultiplicationOp = MultiplicationOp_; + /// Ring operation that performs FMA + using RingOp = RingOp_; using ThreadLayoutA = typename cutlass::platform::conditional< cutlass::platform::is_same, LayoutA>:: @@ -128,9 +151,7 @@ class SrmmaSimt { ThreadLayoutB, ElementC, LayoutC, - AdditionOp, - MultiplicationOp, - dp4a_type + RingOp >; public: @@ -148,6 +169,8 @@ class SrmmaSimt { /// Storage for A tile using FragmentA = typename IteratorA::Fragment; + /// Storage for transformed A tile + using TransformedFragmentA = FragmentA; /// Iterates over the B operand in memory using IteratorB = cutlass::gemm::warp::MmaSimtTileIterator< @@ -162,6 +185,8 @@ class SrmmaSimt { /// Storage for B tile using FragmentB = typename IteratorB::Fragment; + /// Storage for transformed B tile + using TransformedFragmentB = FragmentB; /// Iterates over the C operand in memory using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator< @@ -197,6 +222,15 @@ class SrmmaSimt { srmma(d, a, b, c); } + + /// Transform the mma operands to the required types + CUTLASS_DEVICE + void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, + FragmentA const &A, 
FragmentB const &B) const { + //TODO: Implement this + dst_A = A; + dst_B = B; + } }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cuasr/reduction/kernel/reduce_split_k.h b/include/cuasr/reduction/kernel/reduce_split_k.h index 83397ef..229e591 100644 --- a/include/cuasr/reduction/kernel/reduce_split_k.h +++ b/include/cuasr/reduction/kernel/reduce_split_k.h @@ -1,24 +1,31 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* **************************************************************************************************/ @@ -116,9 +123,7 @@ class ReduceSplitK { destination(destination_), source(source_), output(output_), - reduction(reduction_) { - - } + reduction(reduction_) { } }; struct SharedStorage { }; @@ -147,8 +152,8 @@ class ReduceSplitK { // Determine CTA position cutlass::MatrixCoord thread_offset( - int(blockIdx.x) * Shape::kRow + threadIdx.y, - int(blockIdx.y) * Shape::kColumn + threadIdx.x * kElementsPerAccess + cutlass::MatrixCoord::Index(int(blockIdx.x) * Shape::kRow + threadIdx.y), + cutlass::MatrixCoord::Index(int(blockIdx.y) * Shape::kColumn + threadIdx.x * kElementsPerAccess) ); // One guard conditional diff --git a/include/cuasr/reduction/thread/reduce.h b/include/cuasr/reduction/thread/reduce.h index 5323eae..a0b0991 100644 --- a/include/cuasr/reduction/thread/reduce.h +++ b/include/cuasr/reduction/thread/reduce.h @@ -1,24 +1,31 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ @@ -41,18 +48,18 @@ namespace reduction { namespace thread { // Structure to compute the thread level reduction with semiring addition operator -template +template struct Reduce { CUTLASS_HOST_DEVICE T operator()(T lhs, T const &rhs) const { - AdditionOp add; - return add(lhs, rhs); + RingOp ring_op; + return ring_op.add(lhs, rhs); } CUTLASS_HOST_DEVICE cutlass::Array operator()(cutlass::Array const &in) const { cutlass::Array result; - result.fill(AdditionOp::Identity); + result.fill(RingOp::AddIdentity); CUTLASS_PRAGMA_UNROLL for (auto i = 0; i < N; ++i) { diff --git a/include/cuasr/reduction/thread/reduction_operators.h b/include/cuasr/reduction/thread/reduction_operators.h index e36b95c..576df91 100644 --- a/include/cuasr/reduction/thread/reduction_operators.h +++ b/include/cuasr/reduction/thread/reduction_operators.h @@ -1,24 +1,31 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ @@ -41,20 +48,20 @@ namespace thread { /// Mixed-precision reduction with a functional reduction operator template < - typename AdditionOp_, + typename RingOp_, typename ElementAccumulator_, typename Element_, int Count = 1 > struct SemiringReduce { // Type aliases - using AdditionOp = AdditionOp_; + using RingOp = RingOp_; using ElementAccumulator = ElementAccumulator_; using Element = Element_; // Static members static int const kCount = Count; - static Element constexpr Identity = AdditionOp::Identity; + static Element constexpr Identity = RingOp::AddIdentity; using FragmentAccumulator = cutlass::Array; using FragmentElement = cutlass::Array; @@ -65,11 +72,9 @@ struct SemiringReduce { // Data members Params params; - // Methods - /// Constructor CUTLASS_HOST_DEVICE - SemiringReduce(Params params_ = Params()): params(params_) { } + SemiringReduce(Params params) : params(params) { }; /// 
Operator CUTLASS_HOST_DEVICE @@ -77,14 +82,19 @@ struct SemiringReduce { FragmentAccumulator accumulator, FragmentElement element) const { - AdditionOp op; + RingOp ring_op; cutlass::NumericArrayConverter< ElementAccumulator, Element, kCount, cutlass::PreferredRoundingMode::kRound> converter; - return op(accumulator, converter(element)); + FragmentAccumulator retval; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + retval[i] = ring_op.add(accumulator[i], converter(element)[i]); + } + return retval; } }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c094d68..4cfe1f2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,5 +20,4 @@ if(NOT EXISTS "${PROJECT_SOURCE_DIR}/test/gtest/googletest/include") endif() add_subdirectory(gtest) -add_subdirectory(regress) add_subdirectory(device) diff --git a/test/device/CMakeLists.txt b/test/device/CMakeLists.txt index 328a9ea..90bdda5 100644 --- a/test/device/CMakeLists.txt +++ b/test/device/CMakeLists.txt @@ -1,27 +1,72 @@ -file(GLOB SIMT_TEST_SRCS CONFIGURE_DEPENDS *.cu) -add_executable(cuasr_test_srgemm_device +# SM50 default tests +add_executable(cuasr_test_srgemm_device_sm50_defaults ${PROJECT_SOURCE_DIR}/test/harness.cpp - ${SIMT_TEST_SRCS} + sm50_defaults.cu ) target_include_directories( - cuasr_test_srgemm_device + cuasr_test_srgemm_device_sm50_defaults PRIVATE ${PROJECT_SOURCE_DIR}/include/ ${PROJECT_SOURCE_DIR}/tools/include/ ${PROJECT_SOURCE_DIR}/cutlass/include/ ${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/ ) -target_link_libraries(cuasr_test_srgemm_device +target_link_libraries(cuasr_test_srgemm_device_sm50_defaults gtest ${cuASR_LIB_NAME} ) add_test( - NAME cuasr_test_srgemm_device - COMMAND cuasr_test_srgemm_device + NAME cuasr_test_srgemm_device_sm50_defaults + COMMAND cuasr_test_srgemm_device_sm50_defaults +) + +# SM50 all shmoo tests +file(GLOB SM50_SIMT_TEST_SRCS CONFIGURE_DEPENDS sm50_simt_*.cu) +add_executable(cuasr_test_srgemm_device_sm50_shmoo + 
${PROJECT_SOURCE_DIR}/test/harness.cpp + ${SM50_SIMT_TEST_SRCS} +) +target_include_directories( + cuasr_test_srgemm_device_sm50_shmoo + PRIVATE + ${PROJECT_SOURCE_DIR}/include/ + ${PROJECT_SOURCE_DIR}/tools/include/ + ${PROJECT_SOURCE_DIR}/cutlass/include/ + ${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/ +) +target_link_libraries(cuasr_test_srgemm_device_sm50_shmoo + gtest + ${cuASR_LIB_NAME} +) +add_test( + NAME cuasr_test_srgemm_device_sm50_shmoo + COMMAND cuasr_test_srgemm_device_sm50_shmoo ) if(NOT DEFINED CUASR_TEST_LEVEL) set(CUASR_TEST_LEVEL 0) endif() -target_compile_definitions(cuasr_test_srgemm_device +target_compile_definitions(cuasr_test_srgemm_device_sm50_shmoo PRIVATE CUASR_TEST_LEVEL=${CUASR_TEST_LEVEL} ) + +# SM80 default tests +add_executable(cuasr_test_srgemm_device_sm80_defaults + ${PROJECT_SOURCE_DIR}/test/harness.cpp + sm80_defaults.cu +) +target_include_directories( + cuasr_test_srgemm_device_sm80_defaults + PRIVATE + ${PROJECT_SOURCE_DIR}/include/ + ${PROJECT_SOURCE_DIR}/tools/include/ + ${PROJECT_SOURCE_DIR}/cutlass/include/ + ${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/ +) +target_link_libraries(cuasr_test_srgemm_device_sm80_defaults + gtest + ${cuASR_LIB_NAME} +) +add_test( + NAME cuasr_test_srgemm_device_sm80_defaults + COMMAND cuasr_test_srgemm_device_sm80_defaults +) diff --git a/test/device/gen_default_test.py b/test/device/gen_default_test.py new file mode 100644 index 0000000..3976c5e --- /dev/null +++ b/test/device/gen_default_test.py @@ -0,0 +1,171 @@ +import os +import sys +import argparse + +################################################################################ +# This file generates test cases for all default SRGEMM configurations. 
+################################################################################ + +precisions = [ + ["f64", "double"], + ["f32", "float"], + ["s32", "int"] +] + +tnspposes = [ + [False, False, True], + [False, False, False], + [False, True, True], + [False, True, False], + [True, False, True], + [True, False, False], + [True, True, True], + [True, True, False], +] + +semiring_operators = [ + ["plus", "mult"], # regular GEMM + ["min", "plus"], # min-plus (tropical) + ["max", "plus"], # max-plus + ["min", "max"], # min-max + ["max", "min"], # max-min + ["min", "mult"], # min-multiplies + ["max", "mult"], # max-multiplies + ["or", "and"] # or-and +] + + +testfile_header = """\ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +""" + + +test_template = """\ +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM{sm_arch}_default_{add_op}_{mult_op}_srgemm, {precision_char}_{tnspA}{tnspB}_{tnspC}) {{ + using precision = {precision_type}; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm{sm_arch}; + using RingOp = cuasr::{add_op}_{mult_op}; + + using 
Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::{tnsp_typeA}Major, // + precision, cutlass::layout::{tnsp_typeB}Major, // + precision, cutlass::layout::{tnsp_typeC}Major, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +}} + +""" + + +def write_test_file_header(testfile): + testfile.write(testfile_header) + + +def write_test_to_file( + testfile, + sm_arch, + add_op, + mult_op, + precision_char, + precision_type, + tnspA, + tnspB, + tnspC): + tnsp_typeA = "Column" if tnspA == "n" else "Row" + tnsp_typeB = "Column" if tnspB == "n" else "Row" + tnsp_typeC = "Column" if tnspC == "n" else "Row" + testfile.write(test_template.format( + sm_arch=sm_arch, + add_op=add_op, + mult_op=mult_op, + precision_char=precision_char, + precision_type=precision_type, + tnspA=tnspA, + tnspB=tnspB, + tnspC=tnspC, + tnsp_typeA=tnsp_typeA, + tnsp_typeB=tnsp_typeB, + tnsp_typeC=tnsp_typeC + )) + + +def main(args): + num_testes = 0 + testfile_name = "sm{}_defaults.cu".format(args.sm_arch) + print(testfile_name) + filePath = os.path.join(args.output_dir, testfile_name) + + # open file and gen all default tests + with open(filePath, "w") as testfile: + write_test_file_header(testfile) + + # for all semirings + for add_op, mult_op in semiring_operators: + # for all precisions + for precision in precisions: + precision_char = precision[0] + precision_type = precision[1] + + # tnspposes + for tnsppose in tnspposes: + # get tnsppose char + column_major_A = tnsppose[0] + column_major_B = tnsppose[1] + column_major_C = tnsppose[2] + tnspA = "n" if column_major_A else "t" + tnspB = "n" if column_major_B else "t" + tnspC = "n" if column_major_C else "t" + + # write to file + write_test_to_file( + testfile, + args.sm_arch, + add_op, + mult_op, + precision_char, + precision_type, + tnspA, + tnspB, + tnspC) + num_testes += 1 + print("Total test count per semi-ring = {}".format( + num_testes // len(semiring_operators))) + + +if 
__name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output-dir", type=str, required=False, default=".", + help="Path to the output dir.") + parser.add_argument("-sm", "--sm-arch", type=int, required=False, default=50, choices=[50, 80], + help="SM architecture version number,") + args = parser.parse_args(sys.argv[1:]) + main(args) diff --git a/test/device/simt_sm50.py b/test/device/gen_simt.py similarity index 82% rename from test/device/simt_sm50.py rename to test/device/gen_simt.py index 1339891..413b496 100644 --- a/test/device/simt_sm50.py +++ b/test/device/gen_simt.py @@ -1,4 +1,6 @@ import os +import sys +import argparse # this file creates the test/unit/gemm/device simt tests and the CMake file to go with it ################################################################################ @@ -25,10 +27,9 @@ # char, type bits/elem, max tile, L0 threadblock tiles precisions = [ - ["d", "double", 64, 64*64, [[64, 64], [32, 32]]], - ["s", "float", 32, 128 * - 128, [[128, 256], [128, 128], [64, 64]]], - # ["h", "cutlass::half_t", 16, 128*256, [ [256, 128], [ 64, 128], [ 64, 32] ] ], + ["f64", "double", 64, 128 * 64, [[128, 64], [ 64, 64], [ 32, 32]]], + ["f32", "float", 32, 256 * 128, [[256, 128], [128, 128], [128, 64], [64, 64]]], + # ["h", "cutlass::half_t", 16, 128*256, [[256, 128], [ 64, 128], [ 64, 32] ] ], # ["i", "int", 32, 128*128, [[128, 64], [16, 32]]], ] @@ -44,19 +45,19 @@ ] semiring_operators = [ - ["plus", "multiplies"], # regular GEMM - ["minimum", "plus"], # min-plus (tropical) - ["maximum", "plus"], # max-plus - ["minimum", "maximum"], # min-max - ["maximum", "minimum"], # max-min - ["minimum", "multiplies"], # min-multiplies - ["maximum", "multiplies"], # max-multiplies - ["binary_or", "binary_and"] # or-and + ["plus", "mult"], # regular GEMM + ["min", "plus"], # min-plus (tropical) + ["max", "plus"], # max-plus + ["min", "max"], # min-max + ["max", "min"], # max-min + ["min", "mult"], # min-multiplies + 
["max", "mult"], # max-multiplies + ["or", "and"] # or-and ] testfile_header = """\ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -92,25 +93,24 @@ test_template = """\ #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= {21}) -TEST(SM50_device_{0}_{1}_{2}srgemm_{4}{5}_{6}, {10}x{11}x{12}_{13}x{14}x1_{15}x{16}_{17}x{18}_{19}x{20}) {{ +TEST(SM{22}_device_{0}_{1}_{2}_srgemm_{4}{5}_{6}, {10}x{11}x{12}_{13}x{14}x1_{15}x{16}_{17}x{18}_{19}x{20}) {{ using precision = {3}; using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; + using SmArch = cutlass::arch::Sm{22}; + using RingOp = cuasr::{0}_{1}; using ThreadblockShape = cutlass::gemm::GemmShape<{10}, {11}, {12}>; using WarpShape = cutlass::gemm::GemmShape<{13}, {14}, {12}>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::{0}, cuasr::{1}, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::{7}Major, // precision, cutlass::layout::{8}Major, // precision, cutlass::layout::{9}Major, // @@ -135,9 +135,9 @@ def write_test_to_file( mult_op, precision_char, precision_type, - transA, - transB, - transC, 
+ tnspA, + tnspB, + tnspC, threadblock_tile, unroll, warp_shape, @@ -146,7 +146,8 @@ def write_test_to_file( warp_threadsM, warp_threadsN, warps_per_tb, - test_level): + test_level, + sm_arch): print("{:.0f}x{:.0f}x{:.0f}__{:.0f}x{:.0f}_{:.0f}x{:.0f}_{:.0f}x{:.0f}".format( threadblock_tile[0], threadblock_tile[1], unroll, thread_tileM, thread_tileN, @@ -160,21 +161,21 @@ def write_test_to_file( threadblock_tile[0], threadblock_tile[1], unroll )) - trans_typeA = "Column" if transA == "n" else "Row" - trans_typeB = "Column" if transB == "n" else "Row" - trans_typeC = "Column" if transC == "n" else "Row" + tnsp_typeA = "Column" if tnspA == "n" else "Row" + tnsp_typeB = "Column" if tnspB == "n" else "Row" + tnsp_typeC = "Column" if tnspC == "n" else "Row" print(precision_type) testfile.write(test_template.format( add_op, # 0 mult_op, # 1 precision_char, # 2 precision_type, # 3 - transA, # 4 - transB, # 5 - transC, # 6 - trans_typeA, # 7 - trans_typeB, # 8 - trans_typeC, # 9 + tnspA, # 4 + tnspB, # 5 + tnspC, # 6 + tnsp_typeA, # 7 + tnsp_typeB, # 8 + tnsp_typeC, # 9 int(threadblock_tile[0]), # 10 int(threadblock_tile[1]), # 11 int(unroll), # 12 @@ -186,11 +187,12 @@ def write_test_to_file( int(warp_threadsN), # 18 int(warps_per_tb[0]), # 19 int(warps_per_tb[1]), # 20 - int(test_level) # 21 + int(test_level), # 21 + int(sm_arch) # 22 )) -def main(output_dir: str): +def main(args): # warps per threadblock warps_per_threadblocks = [] for warps_per_tb0 in WARPS_PER_TB_EDGE: @@ -237,17 +239,17 @@ def main(output_dir: str): column_major_A = transpose[0] column_major_B = transpose[1] column_major_C = transpose[2] - transA = "n" if column_major_A else "t" - transB = "n" if column_major_B else "t" - transC = "n" if column_major_C else "t" + tnspA = "n" if column_major_A else "t" + tnspB = "n" if column_major_B else "t" + tnspC = "n" if column_major_C else "t" # open file - testfile_name = "simt_{}_{}_{}srgemm_{}{}_{}_sm50.cu".format( - add_op, mult_op, precision_char, - transA, 
transB, transC) + testfile_name = "sm{}_simt_{}_{}_{}_srgemm_{}{}_{}.cu".format( + args.sm_arch, add_op, mult_op, precision_char, + tnspA, tnspB, tnspC) print("\n", testfile_name) - filePath = os.path.join(output_dir, testfile_name) + filePath = os.path.join(args.output_dir, testfile_name) with open(filePath, "w") as testfile: write_test_file_header(testfile) @@ -351,9 +353,9 @@ def main(output_dir: str): mult_op, precision_char, precision_type, - transA, - transB, - transC, + tnspA, + tnspB, + tnspC, threadblock_tile, unroll, warp_shape, @@ -362,10 +364,17 @@ def main(output_dir: str): warp_threadsM, warp_threadsN, warps_per_tb, - test_level) + test_level, + args.sm_arch) num_tests += 1 print("Total test count per semi-ring = {}".format(num_tests//len(semiring_operators))) if __name__ == "__main__": - main(".") + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output-dir", type=str, required=False, default=".", + help="Path to the output dir.") + parser.add_argument("-sm", "--sm-arch", type=int, required=False, default=50, choices=[50, 80], + help="SM architecture version number,") + args = parser.parse_args(sys.argv[1:]) + main(args) diff --git a/test/device/simt_binary_or_binary_and_dsrgemm_nn_t_sm50.cu b/test/device/simt_binary_or_binary_and_dsrgemm_nn_t_sm50.cu deleted file mode 100644 index 48e3855..0000000 --- a/test/device/simt_binary_or_binary_and_dsrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) 
and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // 
- ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape 
= cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 
16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; 
- - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_dsrgemm_nt_n_sm50.cu b/test/device/simt_binary_or_binary_and_dsrgemm_nt_n_sm50.cu deleted file mode 100644 index a4e49ed..0000000 --- a/test/device/simt_binary_or_binary_and_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) 
and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // 
- ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape 
= cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 
16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; 
- - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_dsrgemm_tn_n_sm50.cu b/test/device/simt_binary_or_binary_and_dsrgemm_tn_n_sm50.cu deleted file mode 100644 index 8ceca1d..0000000 --- a/test/device/simt_binary_or_binary_and_dsrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) 
and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // 
- ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape 
= cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 
16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; 
- - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_dsrgemm_tn_t_sm50.cu b/test/device/simt_binary_or_binary_and_dsrgemm_tn_t_sm50.cu deleted file mode 100644 index f6dd6d6..0000000 --- a/test/device/simt_binary_or_binary_and_dsrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 
16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - 
- using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_dsrgemm_tt_n_sm50.cu b/test/device/simt_binary_or_binary_and_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index 49958ad..0000000 --- a/test/device/simt_binary_or_binary_and_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 
16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - 
- using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_ssrgemm_nn_n_sm50.cu b/test/device/simt_binary_or_binary_and_ssrgemm_nn_n_sm50.cu deleted file mode 100644 index 17176dd..0000000 --- a/test/device/simt_binary_or_binary_and_ssrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = 
float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; 
- - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, 
SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / 
Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - 
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using 
precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 
-// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_ssrgemm_nn_t_sm50.cu b/test/device/simt_binary_or_binary_and_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index 7bededb..0000000 --- a/test/device/simt_binary_or_binary_and_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} 
-#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 
1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape 
= cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 
32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using 
WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nn_t, 
256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_ssrgemm_nt_n_sm50.cu b/test/device/simt_binary_or_binary_and_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index 102a7e1..0000000 --- a/test/device/simt_binary_or_binary_and_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} 
-#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 
1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape 
= cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 
32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using 
WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_n, 
256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_ssrgemm_nt_t_sm50.cu b/test/device/simt_binary_or_binary_and_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index ae6fafe..0000000 --- a/test/device/simt_binary_or_binary_and_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 
64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_ssrgemm_tn_n_sm50.cu b/test/device/simt_binary_or_binary_and_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index f2a06a6..0000000 --- a/test/device/simt_binary_or_binary_and_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} 
-#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 
1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape 
= cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 
32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using 
WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_n, 
256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_ssrgemm_tn_t_sm50.cu b/test/device/simt_binary_or_binary_and_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index 79bd59f..0000000 --- a/test/device/simt_binary_or_binary_and_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 
64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_ssrgemm_tt_n_sm50.cu b/test/device/simt_binary_or_binary_and_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index 34e7c3b..0000000 --- a/test/device/simt_binary_or_binary_and_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 
64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_binary_or_binary_and_ssrgemm_tt_t_sm50.cu b/test/device/simt_binary_or_binary_and_ssrgemm_tt_t_sm50.cu deleted file mode 100644 index 5fe557d..0000000 --- a/test/device/simt_binary_or_binary_and_ssrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 
8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 
x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 
128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_ssrgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_dsrgemm_nn_t_sm50.cu b/test/device/simt_maximum_minimum_dsrgemm_nn_t_sm50.cu deleted file mode 100644 index 300355c..0000000 --- a/test/device/simt_maximum_minimum_dsrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} 
-#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// 
Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { 
- using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 
x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_dsrgemm_nt_n_sm50.cu b/test/device/simt_maximum_minimum_dsrgemm_nt_n_sm50.cu deleted file mode 100644 index 35a92b9..0000000 --- a/test/device/simt_maximum_minimum_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} 
-#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// 
Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) 
{ - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 
x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_dsrgemm_nt_t_sm50.cu b/test/device/simt_maximum_minimum_dsrgemm_nt_t_sm50.cu deleted file mode 100644 index 5f41aaf..0000000 --- a/test/device/simt_maximum_minimum_dsrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using 
precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_dsrgemm_tn_n_sm50.cu b/test/device/simt_maximum_minimum_dsrgemm_tn_n_sm50.cu deleted file mode 100644 index 391db03..0000000 --- a/test/device/simt_maximum_minimum_dsrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} 
-#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// 
Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) 
{ - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 
x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_dsrgemm_tn_t_sm50.cu b/test/device/simt_maximum_minimum_dsrgemm_tn_t_sm50.cu deleted file mode 100644 index 59a437b..0000000 --- a/test/device/simt_maximum_minimum_dsrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using 
precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_dsrgemm_tt_n_sm50.cu b/test/device/simt_maximum_minimum_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index 194615c..0000000 --- a/test/device/simt_maximum_minimum_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using 
precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_dsrgemm_tt_t_sm50.cu b/test/device/simt_maximum_minimum_dsrgemm_tt_t_sm50.cu deleted file mode 100644 index 56f8f95..0000000 --- a/test/device/simt_maximum_minimum_dsrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp 
= Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// 
Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // 
- precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass 
= cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 
x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_ssrgemm_nn_t_sm50.cu b/test/device/simt_maximum_minimum_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index 64786fe..0000000 --- a/test/device/simt_maximum_minimum_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; 
- using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - 
using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_ssrgemm_nt_n_sm50.cu b/test/device/simt_maximum_minimum_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index 1a983dc..0000000 --- a/test/device/simt_maximum_minimum_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; 
- using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_ssrgemm_nt_t_sm50.cu b/test/device/simt_maximum_minimum_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index a4a5981..0000000 --- a/test/device/simt_maximum_minimum_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_ssrgemm_tn_n_sm50.cu b/test/device/simt_maximum_minimum_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index 491ced1..0000000 --- a/test/device/simt_maximum_minimum_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// 
Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 
8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_ssrgemm_tn_t_sm50.cu b/test/device/simt_maximum_minimum_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index 2ff0802..0000000 --- a/test/device/simt_maximum_minimum_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 
2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_minimum_ssrgemm_tt_n_sm50.cu b/test/device/simt_maximum_minimum_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index 7360a72..0000000 --- a/test/device/simt_maximum_minimum_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_dsrgemm_nn_t_sm50.cu b/test/device/simt_maximum_multiplies_dsrgemm_nn_t_sm50.cu deleted file mode 100644 index 2cbfc31..0000000 --- a/test/device/simt_maximum_multiplies_dsrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 
1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; 
- using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 
32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_dsrgemm_nt_n_sm50.cu b/test/device/simt_maximum_multiplies_dsrgemm_nt_n_sm50.cu deleted file mode 100644 index f431ccc..0000000 --- a/test/device/simt_maximum_multiplies_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 
32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_dsrgemm_tn_n_sm50.cu b/test/device/simt_maximum_multiplies_dsrgemm_tn_n_sm50.cu deleted file mode 100644 index 962f529..0000000 --- a/test/device/simt_maximum_multiplies_dsrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 
32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp 
= Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 
128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_dsrgemm_tn_t_sm50.cu b/test/device/simt_maximum_multiplies_dsrgemm_tn_t_sm50.cu deleted file mode 100644 index f7184e0..0000000 --- a/test/device/simt_maximum_multiplies_dsrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// 
Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 
32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 
-// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git 
a/test/device/simt_maximum_multiplies_dsrgemm_tt_n_sm50.cu b/test/device/simt_maximum_multiplies_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index 58f7d5e..0000000 --- a/test/device/simt_maximum_multiplies_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 
x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_ssrgemm_nn_n_sm50.cu b/test/device/simt_maximum_multiplies_ssrgemm_nn_n_sm50.cu deleted file mode 100644 index 
28cf3e2..0000000 --- a/test/device/simt_maximum_multiplies_ssrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// 
Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 
16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 
64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 
-// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 
128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
- using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_ssrgemm_nn_t_sm50.cu b/test/device/simt_maximum_multiplies_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index 33f4052..0000000 --- a/test/device/simt_maximum_multiplies_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / 
Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using 
WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 
32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// 
Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_ssrgemm_nt_n_sm50.cu b/test/device/simt_maximum_multiplies_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index 61763eb..0000000 --- a/test/device/simt_maximum_multiplies_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 
64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_ssrgemm_nt_t_sm50.cu b/test/device/simt_maximum_multiplies_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index 9de8d19..0000000 --- a/test/device/simt_maximum_multiplies_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - 
- using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 
x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - 
using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 
8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / 
Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 
64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_ssrgemm_tn_n_sm50.cu b/test/device/simt_maximum_multiplies_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index b79d773..0000000 --- a/test/device/simt_maximum_multiplies_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 
64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_ssrgemm_tn_t_sm50.cu b/test/device/simt_maximum_multiplies_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index 0b0de63..0000000 --- a/test/device/simt_maximum_multiplies_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - 
- using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 
x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - 
using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 
8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / 
Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 
64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_ssrgemm_tt_n_sm50.cu b/test/device/simt_maximum_multiplies_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index 25220fa..0000000 --- a/test/device/simt_maximum_multiplies_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - 
- using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 
x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 
8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 
128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_maximum_multiplies_ssrgemm_tt_t_sm50.cu b/test/device/simt_maximum_multiplies_ssrgemm_tt_t_sm50.cu deleted file mode 100644 index ed894ad..0000000 --- a/test/device/simt_maximum_multiplies_ssrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape 
= cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 
x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp 
= Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_ssrgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_dsrgemm_nn_n_sm50.cu b/test/device/simt_minimum_maximum_dsrgemm_nn_n_sm50.cu deleted file mode 100644 index decf223..0000000 --- a/test/device/simt_minimum_maximum_dsrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) 
{ - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / 
Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { 
- using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / 
Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_dsrgemm_nn_t_sm50.cu b/test/device/simt_minimum_maximum_dsrgemm_nn_t_sm50.cu deleted file mode 100644 index fbd514f..0000000 --- a/test/device/simt_minimum_maximum_dsrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - 
using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; 
- - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 
-// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - 
using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 
x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - 
using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / 
Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { 
- using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_dsrgemm_nt_n_sm50.cu b/test/device/simt_minimum_maximum_dsrgemm_nt_n_sm50.cu 
deleted file mode 100644 index ee22572..0000000 --- a/test/device/simt_minimum_maximum_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 
-// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - 
using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - 
using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_dsrgemm_nt_t_sm50.cu b/test/device/simt_minimum_maximum_dsrgemm_nt_t_sm50.cu deleted file mode 100644 index 786188f..0000000 --- a/test/device/simt_minimum_maximum_dsrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using 
precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_dsrgemm_tn_n_sm50.cu b/test/device/simt_minimum_maximum_dsrgemm_tn_n_sm50.cu deleted file mode 100644 index 4ae4217..0000000 --- a/test/device/simt_minimum_maximum_dsrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} 
-#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// 
Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) 
{ - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 
x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_dsrgemm_tn_t_sm50.cu b/test/device/simt_minimum_maximum_dsrgemm_tn_t_sm50.cu deleted file mode 100644 index 42aee49..0000000 --- a/test/device/simt_minimum_maximum_dsrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using 
precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_dsrgemm_tt_n_sm50.cu b/test/device/simt_minimum_maximum_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index e5c9a0f..0000000 --- a/test/device/simt_minimum_maximum_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using 
precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_dsrgemm_tt_t_sm50.cu b/test/device/simt_minimum_maximum_dsrgemm_tt_t_sm50.cu deleted file mode 100644 index 3765f4b..0000000 --- a/test/device/simt_minimum_maximum_dsrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp 
= Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// 
Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // 
- precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass 
= cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 
x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_dsrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_ssrgemm_nn_t_sm50.cu b/test/device/simt_minimum_maximum_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index edf13bc..0000000 --- a/test/device/simt_minimum_maximum_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; 
- using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - 
using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_ssrgemm_nt_n_sm50.cu b/test/device/simt_minimum_maximum_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index 8f2a3a7..0000000 --- a/test/device/simt_minimum_maximum_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; 
- using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_ssrgemm_nt_t_sm50.cu b/test/device/simt_minimum_maximum_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index f898ac7..0000000 --- a/test/device/simt_minimum_maximum_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_ssrgemm_tn_n_sm50.cu b/test/device/simt_minimum_maximum_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index 6611645..0000000 --- a/test/device/simt_minimum_maximum_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// 
Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 
8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_ssrgemm_tn_t_sm50.cu b/test/device/simt_minimum_maximum_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index 5c3ae16..0000000 --- a/test/device/simt_minimum_maximum_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 
2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_maximum_ssrgemm_tt_n_sm50.cu b/test/device/simt_minimum_maximum_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index 2184d53..0000000 --- a/test/device/simt_minimum_maximum_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_dsrgemm_nn_n_sm50.cu b/test/device/simt_minimum_multiplies_dsrgemm_nn_n_sm50.cu deleted file mode 100644 index 79d645d..0000000 --- a/test/device/simt_minimum_multiplies_dsrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 
1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 
32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements 
/ Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using 
WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 
64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_dsrgemm_nn_t_sm50.cu b/test/device/simt_minimum_multiplies_dsrgemm_nn_t_sm50.cu deleted file mode 100644 index 3645cc8..0000000 --- a/test/device/simt_minimum_multiplies_dsrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 
32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_dsrgemm_nt_n_sm50.cu b/test/device/simt_minimum_multiplies_dsrgemm_nt_n_sm50.cu deleted file mode 100644 index 56ca8b7..0000000 --- a/test/device/simt_minimum_multiplies_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 
32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_dsrgemm_nt_t_sm50.cu b/test/device/simt_minimum_multiplies_dsrgemm_nt_t_sm50.cu deleted file mode 100644 index 496c439..0000000 --- a/test/device/simt_minimum_multiplies_dsrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { 
- using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 
-// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { 
- using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 
x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 
32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 
-// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git 
a/test/device/simt_minimum_multiplies_dsrgemm_tn_n_sm50.cu b/test/device/simt_minimum_multiplies_dsrgemm_tn_n_sm50.cu deleted file mode 100644 index e9bfe55..0000000 --- a/test/device/simt_minimum_multiplies_dsrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_dsrgemm_tn_t_sm50.cu b/test/device/simt_minimum_multiplies_dsrgemm_tn_t_sm50.cu deleted file mode 100644 index 
669b8cd..0000000 --- a/test/device/simt_minimum_multiplies_dsrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// 
Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 
32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape 
= cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { 
- using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { 
- using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_dsrgemm_tt_n_sm50.cu b/test/device/simt_minimum_multiplies_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index 29fd0e7..0000000 --- a/test/device/simt_minimum_multiplies_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; 
- - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 
1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 
16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 
-// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape 
= cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 
32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 
x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_ssrgemm_nn_n_sm50.cu b/test/device/simt_minimum_multiplies_ssrgemm_nn_n_sm50.cu deleted file mode 100644 index 48c4034..0000000 --- a/test/device/simt_minimum_multiplies_ssrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} 
-#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 
x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 
32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using 
WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_n, 
256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_ssrgemm_nn_t_sm50.cu b/test/device/simt_minimum_multiplies_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index 3f7a034..0000000 --- a/test/device/simt_minimum_multiplies_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / 
Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using 
WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - 
precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 
32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// 
Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using 
WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_ssrgemm_nt_n_sm50.cu b/test/device/simt_minimum_multiplies_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index 776736d..0000000 --- a/test/device/simt_minimum_multiplies_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 
64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_ssrgemm_nt_t_sm50.cu b/test/device/simt_minimum_multiplies_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index 4388eb4..0000000 --- a/test/device/simt_minimum_multiplies_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - 
- using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 
x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - 
using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 
8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / 
Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 
64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_ssrgemm_tn_n_sm50.cu b/test/device/simt_minimum_multiplies_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index 35aeaca..0000000 --- a/test/device/simt_minimum_multiplies_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 
64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_ssrgemm_tn_t_sm50.cu b/test/device/simt_minimum_multiplies_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index bb8b9c7..0000000 --- a/test/device/simt_minimum_multiplies_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - 
- using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 
x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - 
using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 
8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / 
Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 
64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 
8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_ssrgemm_tt_n_sm50.cu b/test/device/simt_minimum_multiplies_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index 9521aba..0000000 --- a/test/device/simt_minimum_multiplies_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - 
- using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 
32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 
x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 
8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 
128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - 
using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / 
Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_multiplies_ssrgemm_tt_t_sm50.cu b/test/device/simt_minimum_multiplies_ssrgemm_tt_t_sm50.cu deleted file mode 100644 index b4fd358..0000000 --- a/test/device/simt_minimum_multiplies_ssrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape 
= cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 
x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp 
= Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_ssrgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_plus_dsrgemm_tn_t_sm50.cu b/test/device/simt_minimum_plus_dsrgemm_tn_t_sm50.cu deleted file mode 100644 index 216685f..0000000 --- a/test/device/simt_minimum_plus_dsrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, 
// - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass 
= cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // 
- precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, 
// - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_plus_dsrgemm_tt_n_sm50.cu b/test/device/simt_minimum_plus_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index b3da2f5..0000000 --- 
a/test/device/simt_minimum_plus_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // 
- precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 
32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp 
= Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// 
Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, 
// - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using 
OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_minimum_plus_ssrgemm_tn_n_sm50.cu b/test/device/simt_minimum_plus_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index 9d2dd21..0000000 --- a/test/device/simt_minimum_plus_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
- - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using 
precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// 
Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) 
-TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_dsrgemm_nn_n_sm50.cu b/test/device/simt_plus_multiplies_dsrgemm_nn_n_sm50.cu deleted file mode 100644 index 97d031f..0000000 --- a/test/device/simt_plus_multiplies_dsrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 
32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 
32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 
-// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) 
{ - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// 
Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_dsrgemm_nn_t_sm50.cu b/test/device/simt_plus_multiplies_dsrgemm_nn_t_sm50.cu deleted file mode 100644 index 4c238e5..0000000 --- a/test/device/simt_plus_multiplies_dsrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// 
Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - 
using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / 
Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { 
- using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_dsrgemm_nt_n_sm50.cu b/test/device/simt_plus_multiplies_dsrgemm_nt_n_sm50.cu 
deleted file mode 100644 index e958ee7..0000000 --- a/test/device/simt_plus_multiplies_dsrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 
-// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - 
using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - 
using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_dsrgemm_nt_t_sm50.cu b/test/device/simt_plus_multiplies_dsrgemm_nt_t_sm50.cu deleted file mode 100644 index ddc481e..0000000 --- a/test/device/simt_plus_multiplies_dsrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using 
precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_dsrgemm_tn_t_sm50.cu b/test/device/simt_plus_multiplies_dsrgemm_tn_t_sm50.cu deleted file mode 100644 index bbd7618..0000000 --- a/test/device/simt_plus_multiplies_dsrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using 
precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_dsrgemm_tt_n_sm50.cu b/test/device/simt_plus_multiplies_dsrgemm_tt_n_sm50.cu deleted file mode 100644 index f92a16c..0000000 --- a/test/device/simt_plus_multiplies_dsrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
- cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// 
Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using 
precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// 
Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); 
-} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_dsrgemm_tt_t_sm50.cu b/test/device/simt_plus_multiplies_dsrgemm_tt_t_sm50.cu deleted file mode 100644 index 70987c3..0000000 --- a/test/device/simt_plus_multiplies_dsrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,1321 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp 
= Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// 
Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // 
- precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = double; - using OpClass 
= cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 
x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = double; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_ssrgemm_nn_n_sm50.cu b/test/device/simt_plus_multiplies_ssrgemm_nn_n_sm50.cu deleted file mode 100644 index e43e284..0000000 --- a/test/device/simt_plus_multiplies_ssrgemm_nn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - 
using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// 
Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / 
Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
- - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - 
using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / 
Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
- - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// 
Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp 
= Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_ssrgemm_nn_t_sm50.cu b/test/device/simt_plus_multiplies_ssrgemm_nn_t_sm50.cu deleted file mode 100644 index f3718d6..0000000 --- a/test/device/simt_plus_multiplies_ssrgemm_nn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; 
- using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - 
using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_ssrgemm_nt_n_sm50.cu b/test/device/simt_plus_multiplies_ssrgemm_nt_n_sm50.cu deleted file mode 100644 index 3c59a75..0000000 --- a/test/device/simt_plus_multiplies_ssrgemm_nt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, 
// - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; 
- using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, 
MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, 
SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// 
Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_ssrgemm_nt_t_sm50.cu b/test/device/simt_plus_multiplies_ssrgemm_nt_t_sm50.cu deleted file mode 100644 index 266d2fd..0000000 --- a/test/device/simt_plus_multiplies_ssrgemm_nt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_ssrgemm_tn_n_sm50.cu b/test/device/simt_plus_multiplies_ssrgemm_tn_n_sm50.cu deleted file mode 100644 index fa563e5..0000000 --- a/test/device/simt_plus_multiplies_ssrgemm_tn_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 
16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif 
- -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = 
cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// 
Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) 
{ - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 
8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { 
- using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = 
Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 
4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_ssrgemm_tn_t_sm50.cu b/test/device/simt_plus_multiplies_ssrgemm_tn_t_sm50.cu deleted file mode 100644 index e15d682..0000000 --- a/test/device/simt_plus_multiplies_ssrgemm_tn_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ 
-/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 
2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using 
SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_ssrgemm_tt_n_sm50.cu b/test/device/simt_plus_multiplies_ssrgemm_tt_n_sm50.cu deleted file mode 100644 index ddf8323..0000000 --- a/test/device/simt_plus_multiplies_ssrgemm_tt_n_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
-**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - 
using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // 
- precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch 
= cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::ColumnMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/simt_plus_multiplies_ssrgemm_tt_t_sm50.cu b/test/device/simt_plus_multiplies_ssrgemm_tt_t_sm50.cu deleted file mode 100644 index 1cacc99..0000000 --- a/test/device/simt_plus_multiplies_ssrgemm_tt_t_sm50.cu +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). -**************************************************************************************************/ -///////////////////////////////////////////////////////////////// -// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // -///////////////////////////////////////////////////////////////// - -#include "gtest/gtest.h" - -/// from upstream cutlass -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" - -/// from cuasr lib -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" -#include "cuasr/functional.h" - -/// from cuasr tools -#include "cuasr/reference/srgemm/host_srgemm.h" - -/// from local test dir -#include "testbed.h" - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// 
Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 1 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 8 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<8, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 16 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 
16x128x8_16x64x1_4x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using 
EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / 
Warp: 4 x 8 -// Warps / Block: 1 x 2 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 1 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 1 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 1 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 16 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { - using 
precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 32 
x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, 
cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 2 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 16 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 32 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// 
-// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 2 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 2 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 32 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 2 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 32 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 8 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 2 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 64 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; - using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 32 x 128 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 2 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = 
cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 8 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 64 x 256 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; - using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 2 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 32 x 16 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; - using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 4 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - -//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 4 x 8 -// Warps / Block: 4 x 4 -// Threadblock: 128 x 128 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; - using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - 
-//////////////////////////////////////////////////////////////////////////////// -// Elements / Thread: 8 x 4 -// Threads / Warp: 8 x 4 -// Warps / Block: 4 x 4 -// Threadblock: 256 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_ssrgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { - using precision = float; - using OpClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - - using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; - using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; - using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; - - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; - using EpilogueOutputOp = Config::EpilogueOutputOp; - - using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, cutlass::layout::RowMajor, // - precision, OpClass, SmArch, // - ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; - - EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); -} -#endif - diff --git a/test/device/sm50_defaults.cu b/test/device/sm50_defaults.cu new file mode 100644 index 0000000..6538eda --- /dev/null +++ b/test/device/sm50_defaults.cu @@ -0,0 +1,3481 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM50_default_plus_mult_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_plus_mult_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_plus_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_plus_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_min_max_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+///////////////////////////////////////////////////////////////////////////////
+
+// Default-configuration SRGEMM cases for the SIMT op class and Sm50 arch tag
+// (suites in this stretch: SM50_default_min_max_srgemm, then
+// SM50_default_max_min_srgemm).
+// Case names read <precision>_<ab>_<c>: f64 / f32 / s32 select double / float /
+// int, and each layout letter (t = RowMajor, n = ColumnMajor) names, in order,
+// the first, second and third layout argument handed to Srgemm.
+// Each case passes its Srgemm alias to TestAllGemm explicitly; a bare
+// TestAllGemm() call would leave the alias unused and the configuration
+// declared by the case would never reach the harness.
+// NOTE(review): confirm whether the cuasr ring operators are class templates
+// that need an explicit <precision> argument here.
+
+TEST(SM50_default_min_max_srgemm, s32_nt_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_max;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_max_srgemm, s32_nn_n) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_max;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_max_srgemm, s32_nn_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_max;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f64_tt_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f64_tt_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f64_tn_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f64_tn_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Default-configuration SRGEMM cases for the SIMT op class and Sm50 arch tag
+// (suite in this stretch: SM50_default_max_min_srgemm, f64 and f32 cases).
+// Case names read <precision>_<ab>_<c>: f64 / f32 / s32 select double / float /
+// int, and each layout letter (t = RowMajor, n = ColumnMajor) names, in order,
+// the first, second and third layout argument handed to Srgemm.
+// Each case passes its Srgemm alias to TestAllGemm explicitly; a bare
+// TestAllGemm() call would leave the alias unused and the configuration
+// declared by the case would never reach the harness.
+// NOTE(review): confirm whether the cuasr ring operators are class templates
+// that need an explicit <precision> argument here.
+
+TEST(SM50_default_max_min_srgemm, f64_nt_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f64_nt_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f64_nn_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f64_nn_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f32_tt_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f32_tt_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f32_tn_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Default-configuration SRGEMM cases for the SIMT op class and Sm50 arch tag
+// (suite in this stretch: SM50_default_max_min_srgemm, f32 and s32 cases).
+// Case names read <precision>_<ab>_<c>: f64 / f32 / s32 select double / float /
+// int, and each layout letter (t = RowMajor, n = ColumnMajor) names, in order,
+// the first, second and third layout argument handed to Srgemm.
+// Each case passes its Srgemm alias to TestAllGemm explicitly; a bare
+// TestAllGemm() call would leave the alias unused and the configuration
+// declared by the case would never reach the harness.
+// NOTE(review): confirm whether the cuasr ring operators are class templates
+// that need an explicit <precision> argument here.
+
+TEST(SM50_default_max_min_srgemm, f32_tn_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f32_nt_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f32_nt_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f32_nn_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, f32_nn_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, s32_tt_n) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, s32_tt_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Default-configuration SRGEMM cases for the SIMT op class and Sm50 arch tag
+// (suites in this stretch: SM50_default_max_min_srgemm, then
+// SM50_default_min_mult_srgemm).
+// Case names read <precision>_<ab>_<c>: f64 / f32 / s32 select double / float /
+// int, and each layout letter (t = RowMajor, n = ColumnMajor) names, in order,
+// the first, second and third layout argument handed to Srgemm.
+// Each case passes its Srgemm alias to TestAllGemm explicitly; a bare
+// TestAllGemm() call would leave the alias unused and the configuration
+// declared by the case would never reach the harness.
+// NOTE(review): confirm whether the cuasr ring operators are class templates
+// that need an explicit <precision> argument here.
+
+TEST(SM50_default_max_min_srgemm, s32_tn_n) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, s32_tn_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, s32_nt_n) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, s32_nt_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, s32_nn_n) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_min_srgemm, s32_nn_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_min;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f64_tt_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Default-configuration SRGEMM cases for the SIMT op class and Sm50 arch tag
+// (suite in this stretch: SM50_default_min_mult_srgemm, f64 cases).
+// Case names read <precision>_<ab>_<c>: f64 / f32 / s32 select double / float /
+// int, and each layout letter (t = RowMajor, n = ColumnMajor) names, in order,
+// the first, second and third layout argument handed to Srgemm.
+// Each case passes its Srgemm alias to TestAllGemm explicitly; a bare
+// TestAllGemm() call would leave the alias unused and the configuration
+// declared by the case would never reach the harness.
+// NOTE(review): confirm whether the cuasr ring operators are class templates
+// that need an explicit <precision> argument here.
+
+TEST(SM50_default_min_mult_srgemm, f64_tt_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f64_tn_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f64_tn_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f64_nt_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f64_nt_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f64_nn_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f64_nn_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Default-configuration SRGEMM cases for the SIMT op class and Sm50 arch tag
+// (suite in this stretch: SM50_default_min_mult_srgemm, f32 cases).
+// Case names read <precision>_<ab>_<c>: f64 / f32 / s32 select double / float /
+// int, and each layout letter (t = RowMajor, n = ColumnMajor) names, in order,
+// the first, second and third layout argument handed to Srgemm.
+// Each case passes its Srgemm alias to TestAllGemm explicitly; a bare
+// TestAllGemm() call would leave the alias unused and the configuration
+// declared by the case would never reach the harness.
+// NOTE(review): confirm whether the cuasr ring operators are class templates
+// that need an explicit <precision> argument here.
+
+TEST(SM50_default_min_mult_srgemm, f32_tt_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f32_tt_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f32_tn_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f32_tn_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f32_nt_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f32_nt_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, f32_nn_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Default-configuration SRGEMM cases for the SIMT op class and Sm50 arch tag
+// (suite in this stretch: SM50_default_min_mult_srgemm, f32 and s32 cases).
+// Case names read <precision>_<ab>_<c>: f64 / f32 / s32 select double / float /
+// int, and each layout letter (t = RowMajor, n = ColumnMajor) names, in order,
+// the first, second and third layout argument handed to Srgemm.
+// Each case passes its Srgemm alias to TestAllGemm explicitly; a bare
+// TestAllGemm() call would leave the alias unused and the configuration
+// declared by the case would never reach the harness.
+// NOTE(review): confirm whether the cuasr ring operators are class templates
+// that need an explicit <precision> argument here.
+
+TEST(SM50_default_min_mult_srgemm, f32_nn_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, s32_tt_n) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, s32_tt_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, s32_tn_n) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, s32_tn_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, s32_nt_n) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, s32_nt_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Default-configuration SRGEMM cases for the SIMT op class and Sm50 arch tag
+// (suites in this stretch: SM50_default_min_mult_srgemm, then
+// SM50_default_max_mult_srgemm).
+// Case names read <precision>_<ab>_<c>: f64 / f32 / s32 select double / float /
+// int, and each layout letter (t = RowMajor, n = ColumnMajor) names, in order,
+// the first, second and third layout argument handed to Srgemm.
+// Each case passes its Srgemm alias to TestAllGemm explicitly; a bare
+// TestAllGemm() call would leave the alias unused and the configuration
+// declared by the case would never reach the harness.
+// NOTE(review): confirm whether the cuasr ring operators are class templates
+// that need an explicit <precision> argument here.
+
+TEST(SM50_default_min_mult_srgemm, s32_nn_n) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_min_mult_srgemm, s32_nn_t) {
+  using precision = int;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f64_tt_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f64_tt_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f64_tn_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f64_tn_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f64_nt_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Default-configuration SRGEMM cases for the SIMT op class and Sm50 arch tag
+// (suite in this stretch: SM50_default_max_mult_srgemm, f64 and f32 cases).
+// Case names read <precision>_<ab>_<c>: f64 / f32 / s32 select double / float /
+// int, and each layout letter (t = RowMajor, n = ColumnMajor) names, in order,
+// the first, second and third layout argument handed to Srgemm.
+// Each case passes its Srgemm alias to TestAllGemm explicitly; a bare
+// TestAllGemm() call would leave the alias unused and the configuration
+// declared by the case would never reach the harness.
+// NOTE(review): confirm whether the cuasr ring operators are class templates
+// that need an explicit <precision> argument here.
+
+TEST(SM50_default_max_mult_srgemm, f64_nt_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f64_nn_n) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f64_nn_t) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f32_tt_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f32_tt_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f32_tn_n) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_default_max_mult_srgemm, f32_tn_t) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::max_mult;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor,
+      precision, OpClass, SmArch>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_max_mult_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< 
// + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_default_or_and_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// 
+ +TEST(SM50_default_or_and_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + diff --git a/test/device/simt_minimum_maximum_ssrgemm_nn_n_sm50.cu b/test/device/sm50_simt_max_min_f32_srgemm_nn_n.cu similarity index 76% rename from test/device/simt_minimum_maximum_ssrgemm_nn_n_sm50.cu rename to test/device/sm50_simt_max_min_f32_srgemm_nn_n.cu index e283b64..8637ee5 100644 --- a/test/device/simt_minimum_maximum_ssrgemm_nn_n_sm50.cu +++ b/test/device/sm50_simt_max_min_f32_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 
16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 
32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 
128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // 
@@ -821,25 +799,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1036,26 +1008,25 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 +1114,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { 
+TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 
64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1469,25 +1429,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, 
// @@ -1505,25 +1464,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1576,26 +1533,25 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 
32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1829,25 +1779,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 
64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / 
Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; 
+ using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // diff --git a/test/device/sm50_simt_max_min_f32_srgemm_nn_t.cu b/test/device/sm50_simt_max_min_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..fd6e588 --- /dev/null +++ b/test/device/sm50_simt_max_min_f32_srgemm_nn_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_min_f32_srgemm_nn_t, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 0
+
diff --git a/test/device/sm50_simt_max_min_f32_srgemm_nt_n.cu
b/test/device/sm50_simt_max_min_f32_srgemm_nt_n.cu
new file mode 100644
index 0000000..d656b0e
--- /dev/null
+++ b/test/device/sm50_simt_max_min_f32_srgemm_nt_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 1
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm configured above
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+// Built only when CUASR_TEST_LEVEL >= 1: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 0: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+// Built only when CUASR_TEST_LEVEL >= 1: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+// Built only when CUASR_TEST_LEVEL >= 2: f32 max-min SRGEMM, SIMT op class, Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): <precision> restored (stripped in extraction) - confirm against max_min's declaration.
+  using RingOp = cuasr::max_min<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_min_f32_srgemm_nt_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_min_f32_srgemm_nt_t.cu b/test/device/sm50_simt_max_min_f32_srgemm_nt_t.cu
new file mode 100644
index 0000000..ec086c4
--- /dev/null
+++ b/test/device/sm50_simt_max_min_f32_srgemm_nt_t.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // exercise the Srgemm type configured above
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 
x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape =
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_nt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_max_min_f32_srgemm_tn_n.cu b/test/device/sm50_simt_max_min_f32_srgemm_tn_n.cu new file mode 100644 index 0000000..e969bc8 --- /dev/null +++ b/test/device/sm50_simt_max_min_f32_srgemm_tn_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) // built only at test level 0 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) // built only at test level 0 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or higher
+TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // max_min semiring, float, SIMT, Sm50
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min; // NOTE(review): confirm max_min needs no element-type argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the configured kernel to the testbed
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_maximum_plus_ssrgemm_tn_t_sm50.cu b/test/device/sm50_simt_max_min_f32_srgemm_tn_t.cu similarity index 76% rename from test/device/simt_maximum_plus_ssrgemm_tn_t_sm50.cu rename to test/device/sm50_simt_max_min_f32_srgemm_tn_t.cu index d8e96ac..ba81827 100644 --- a/test/device/simt_maximum_plus_ssrgemm_tn_t_sm50.cu +++ b/test/device/sm50_simt_max_min_f32_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = 
float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 
64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1036,26 +1008,25 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 
128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1114,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // 
Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 
64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1469,25 +1429,24 
@@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // 
@@ -1576,26 +1533,25 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1829,25 +1779,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tn_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/sm50_simt_max_min_f32_srgemm_tt_n.cu b/test/device/sm50_simt_max_min_f32_srgemm_tt_n.cu new file mode 100644 index 0000000..9646611 --- /dev/null +++ b/test/device/sm50_simt_max_min_f32_srgemm_tt_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar 
(thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 
x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tt_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_maximum_minimum_ssrgemm_tt_t_sm50.cu b/test/device/sm50_simt_max_min_f32_srgemm_tt_t.cu similarity index 76% rename from test/device/simt_maximum_minimum_ssrgemm_tt_t_sm50.cu rename to test/device/sm50_simt_max_min_f32_srgemm_tt_t.cu index 22b2041..bc07532 100644 --- a/test/device/simt_maximum_minimum_ssrgemm_tt_t_sm50.cu +++ b/test/device/sm50_simt_max_min_f32_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; 
+ using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { 
using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 
64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { 
+TEST(SM50_device_max_min_f32_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 
32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1036,26 +1008,25 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / 
Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // 
@@ -1145,25 +1114,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1469,25 +1429,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1576,26 +1533,25 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { 
+TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1829,25 +1779,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 
64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 
64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f32_srgemm_tt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_min_f32_srgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f32_srgemm_tt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git 
a/test/device/sm50_simt_max_min_f64_srgemm_nn_n.cu b/test/device/sm50_simt_max_min_f64_srgemm_nn_n.cu new file mode 100644 index 0000000..9372aee --- /dev/null +++ b/test/device/sm50_simt_max_min_f64_srgemm_nn_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) 
and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 
8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass 
= cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
+ + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_plus_multiplies_dsrgemm_tn_n_sm50.cu b/test/device/sm50_simt_max_min_f64_srgemm_nn_t.cu similarity index 70% rename from test/device/simt_plus_multiplies_dsrgemm_tn_n_sm50.cu rename to test/device/sm50_simt_max_min_f64_srgemm_nn_t.cu index 0ac35ad..ef52e64 100644 --- a/test/device/simt_plus_multiplies_dsrgemm_tn_n_sm50.cu +++ b/test/device/sm50_simt_max_min_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { 
+TEST(SM50_device_max_min_f64_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ 
TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision 
= double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 
8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { 
+TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,62 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +974,62 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +1044,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1037,28 +1079,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1114,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1149,97 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { 
+TEST(SM50_device_max_min_f64_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1254,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 
64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1289,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using 
WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1324,27 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1359,62 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1429,62 @@ TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_plus_multiplies_dsrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_min_f64_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = 
cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::plus, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/sm50_simt_max_min_f64_srgemm_nt_n.cu b/test/device/sm50_simt_max_min_f64_srgemm_nt_n.cu new file mode 100644 index 0000000..8a8b3eb --- /dev/null +++ b/test/device/sm50_simt_max_min_f64_srgemm_nt_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} 
+#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 2
// Threadblock: 16 x 128 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 1 x 2
// Threadblock: 32 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 2
// Threadblock: 32 x 64 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 1
// Threadblock: 32 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 1
// Threadblock: 64 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 16 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 16 x 64 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 64 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 128 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 64 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 64 x 64 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 0.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 128 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 16 x 64 x 16
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 16 x 128 x 16
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 4
// Threadblock: 32 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 32 x 64 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 32 x 128 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 32 x 256 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 4
// Threadblock: 64 x 64 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 64 x 128 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 32 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 64 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 64 x 64 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 128 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 128 x 64 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 0.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 256 x 32 x 8
//
// f64 max_min SRGEMM (SIMT op class, Sm50 arch tag); layouts in template order
// are ColumnMajor, RowMajor, ColumnMajor. Built when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_min_f64_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Semiring operator, instantiated for the element type used in this test.
  using RingOp           = cuasr::max_min<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      2>;  // NOTE(review): presumably the pipeline stage count - confirm

  // Name the kernel type explicitly: the call takes no arguments, so this is
  // the only thing that ties the testbed run to the Srgemm alias above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_min_f64_srgemm_nt_t.cu b/test/device/sm50_simt_max_min_f64_srgemm_nt_t.cu
new file mode 100644
index 0000000..171ad01
--- /dev/null
+++ b/test/device/sm50_simt_max_min_f64_srgemm_nt_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on this Srgemm
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_max_min_f64_srgemm_tn_n.cu b/test/device/sm50_simt_max_min_f64_srgemm_tn_n.cu new file mode 100644 index 0000000..89540b4 --- /dev/null +++ b/test/device/sm50_simt_max_min_f64_srgemm_tn_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type given explicitly; nothing to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type given explicitly; nothing to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type given explicitly; nothing to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type given explicitly; nothing to deduce it from
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_min_f64_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_min;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, // "t" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "n" in tn_n
+    precision, cutlass::layout::ColumnMajor, // "_n" in tn_n
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/simt_binary_or_binary_and_dsrgemm_nt_t_sm50.cu b/test/device/sm50_simt_max_min_f64_srgemm_tn_t.cu
similarity index 70%
rename from test/device/simt_binary_or_binary_and_dsrgemm_nt_t_sm50.cu
rename to test/device/sm50_simt_max_min_f64_srgemm_tn_t.cu
index 4c21b98..dcc9368 100644
--- a/test/device/simt_binary_or_binary_and_dsrgemm_nt_t_sm50.cu
+++ b/test/device/sm50_simt_max_min_f64_srgemm_tn_t.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
-* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu).
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,27 +29,26 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -65,27 +64,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 
16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -101,27 +99,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -137,27 +134,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, 
// + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -173,27 +169,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -209,27 +204,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { 
+TEST(SM50_device_max_min_f64_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -245,27 +239,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -281,27 +274,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -317,27 +309,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -353,27 +344,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -389,27 +379,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -425,27 +414,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -461,27 +449,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -497,27 +484,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -533,27 +519,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -569,27 +554,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -605,27 +589,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -641,27 +624,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) // Warps / 
Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -677,27 +659,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -713,27 +694,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -749,27 +729,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -785,27 +764,26 @@ 
TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -821,27 +799,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -857,27 +834,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -893,27 +869,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // @@ -929,27 +904,61 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = 
double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -965,27 +974,61 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1001,27 +1044,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1037,27 +1079,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1073,27 +1114,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1109,27 +1149,96 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = 
double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1145,27 +1254,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1181,27 +1289,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 
32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1217,27 +1324,26 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1253,27 +1359,61 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1289,27 +1429,61 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // diff --git a/test/device/simt_maximum_multiplies_dsrgemm_nt_t_sm50.cu b/test/device/sm50_simt_max_min_f64_srgemm_tt_n.cu similarity index 70% rename from test/device/simt_maximum_multiplies_dsrgemm_nt_t_sm50.cu rename to test/device/sm50_simt_max_min_f64_srgemm_tt_n.cu index 84c4fc5..960a6fb 100644 --- a/test/device/simt_maximum_multiplies_dsrgemm_nt_t_sm50.cu +++ b/test/device/sm50_simt_max_min_f64_srgemm_tt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { 
+TEST(SM50_device_max_min_f64_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 
32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 
32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { 
+TEST(SM50_device_max_min_f64_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,62 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 
32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +974,62 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +1044,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1037,28 +1079,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1114,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1149,97 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1254,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1289,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1324,27 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using 
WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1359,62 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1429,62 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_minimum_multiplies_dsrgemm_tt_t_sm50.cu b/test/device/sm50_simt_max_min_f64_srgemm_tt_t.cu similarity index 70% rename from test/device/simt_minimum_multiplies_dsrgemm_tt_t_sm50.cu rename to test/device/sm50_simt_max_min_f64_srgemm_tt_t.cu index 4cd90e9..59f91bf 100644 --- a/test/device/simt_minimum_multiplies_dsrgemm_tt_t_sm50.cu +++ b/test/device/sm50_simt_max_min_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; 
using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { 
+TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ 
TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; 
using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch 
= cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_min_f64_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_min_f64_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_multiplies_dsrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_min_f64_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_min; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_min_f64_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_min; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/simt_maximum_plus_ssrgemm_nn_n_sm50.cu b/test/device/sm50_simt_max_mult_f32_srgemm_nn_n.cu similarity index 76% rename from test/device/simt_maximum_plus_ssrgemm_nn_n_sm50.cu rename to test/device/sm50_simt_max_mult_f32_srgemm_nn_n.cu index 0289e86..0fa62a4 100644 --- a/test/device/simt_maximum_plus_ssrgemm_nn_n_sm50.cu +++ b/test/device/sm50_simt_max_mult_f32_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { 
using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { 
+TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 
64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ 
-1036,26 +1008,25 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; 
using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 +1114,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = 
float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1469,25 +1429,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { 
+TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1576,26 +1533,25 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 
128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 
256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ 
-1829,25 +1779,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_nn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // diff --git a/test/device/simt_maximum_plus_ssrgemm_tn_n_sm50.cu b/test/device/sm50_simt_max_mult_f32_srgemm_nn_t.cu similarity index 76% rename from test/device/simt_maximum_plus_ssrgemm_tn_n_sm50.cu rename to test/device/sm50_simt_max_mult_f32_srgemm_nn_t.cu index 84e6fa7..0537a44 100644 --- a/test/device/simt_maximum_plus_ssrgemm_tn_n_sm50.cu +++ b/test/device/sm50_simt_max_mult_f32_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 
16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = 
float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +939,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +974,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1036,29 +1008,28 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1044,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1079,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1114,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1149,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1184,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1219,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1254,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1325,28 +1289,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1361,28 +1324,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1397,28 +1359,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1433,28 +1394,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 
// Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1469,28 +1429,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1505,28 +1464,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1541,28 +1499,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1576,29 +1533,28 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1613,28 +1569,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1649,28 +1604,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1685,28 +1639,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1721,28 +1674,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1757,28 +1709,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1793,28 +1744,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1829,28 +1779,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1865,28 +1814,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1901,28 +1849,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1937,28 +1884,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // 
Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1973,28 +1919,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,28 +1954,62 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2045,28 +2024,62 @@ TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_nn_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape 
= cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_minimum_plus_ssrgemm_nn_t_sm50.cu b/test/device/sm50_simt_max_mult_f32_srgemm_nt_n.cu similarity index 76% rename from test/device/simt_minimum_plus_ssrgemm_nn_t_sm50.cu rename to test/device/sm50_simt_max_mult_f32_srgemm_nt_n.cu index df4cb7d..57c42e0 100644 --- a/test/device/simt_minimum_plus_ssrgemm_nn_t_sm50.cu +++ b/test/device/sm50_simt_max_mult_f32_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { 
+TEST(SM50_device_max_mult_f32_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ 
TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = 
float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 
32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 
64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +939,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +974,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1036,29 +1008,28 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 
128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1044,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1079,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1114,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1149,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1184,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1219,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1254,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1325,28 +1289,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1361,28 +1324,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1397,28 +1359,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { 
+TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1433,28 +1394,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1469,28 +1429,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1505,28 +1464,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1541,28 +1499,27 @@ 
TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1576,29 +1533,28 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1613,28 +1569,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1649,28 +1604,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1685,28 +1639,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1721,28 +1674,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 
256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1757,28 +1709,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1793,28 +1744,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1829,28 +1779,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1865,28 +1814,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1901,28 +1849,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1937,28 +1884,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1973,28 +1919,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,28 +1954,62 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2045,28 +2024,62 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_nt_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_minimum_plus_ssrgemm_nt_t_sm50.cu b/test/device/sm50_simt_max_mult_f32_srgemm_nt_t.cu similarity index 76% rename from test/device/simt_minimum_plus_ssrgemm_nt_t_sm50.cu rename to test/device/sm50_simt_max_mult_f32_srgemm_nt_t.cu index f470b8b..15e0842 100644 --- a/test/device/simt_minimum_plus_ssrgemm_nt_t_sm50.cu +++ b/test/device/sm50_simt_max_mult_f32_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 
16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { 
+TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 
16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // 
Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 
32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +939,24 @@ 
TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1036,26 +1008,25 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1114,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // 
- AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass 
= cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1469,25 +1429,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1576,26 +1533,25 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1829,25 +1779,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 
x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 
128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2045,25 +2024,59 @@ 
TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_nt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/sm50_simt_max_mult_f32_srgemm_tn_n.cu b/test/device/sm50_simt_max_mult_f32_srgemm_tn_n.cu new file mode 100644 index 0000000..fd21b65 --- /dev/null +++ b/test/device/sm50_simt_max_mult_f32_srgemm_tn_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 1 to build.
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 0 to build.
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 1 to build.
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 1 to build.
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 0 to build.
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 1 to build.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 1 to build.
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 1 to build.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 1 to build.
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// max_mult f32 SIMT SRGEMM; needs CUASR_TEST_LEVEL defined and >= 2 to build.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> if max_mult is a class template -- confirm.
+  using RingOp           = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no function arguments, so the kernel under test is
+  // selected through its template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+//
+// Exercises cuasr::max_mult on float through the SIMT op class for Sm50 with
+// row-major / column-major / column-major layout arguments.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): max_mult may need an element type (max_mult<precision>) -- confirm.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                       //
+    RingOp,                                                         //
+    precision, cutlass::layout::RowMajor,                           //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, OpClass, SmArch,                                     //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the kernel type configured above to the testbed; without the explicit
+  // template argument the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+//
+// Exercises cuasr::max_mult on float through the SIMT op class for Sm50 with
+// row-major / column-major / column-major layout arguments.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): max_mult may need an element type (max_mult<precision>) -- confirm.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                       //
+    RingOp,                                                         //
+    precision, cutlass::layout::RowMajor,                           //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, OpClass, SmArch,                                     //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the kernel type configured above to the testbed; without the explicit
+  // template argument the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+//
+// Exercises cuasr::max_mult on float through the SIMT op class for Sm50 with
+// row-major / column-major / column-major layout arguments.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): max_mult may need an element type (max_mult<precision>) -- confirm.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                       //
+    RingOp,                                                         //
+    precision, cutlass::layout::RowMajor,                           //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, OpClass, SmArch,                                     //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the kernel type configured above to the testbed; without the explicit
+  // template argument the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+//
+// Exercises cuasr::max_mult on float through the SIMT op class for Sm50 with
+// row-major / column-major / column-major layout arguments.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): max_mult may need an element type (max_mult<precision>) -- confirm.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                       //
+    RingOp,                                                         //
+    precision, cutlass::layout::RowMajor,                           //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, OpClass, SmArch,                                     //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the kernel type configured above to the testbed; without the explicit
+  // template argument the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+//
+// Exercises cuasr::max_mult on float through the SIMT op class for Sm50 with
+// row-major / column-major / column-major layout arguments.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): max_mult may need an element type (max_mult<precision>) -- confirm.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                       //
+    RingOp,                                                         //
+    precision, cutlass::layout::RowMajor,                           //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, OpClass, SmArch,                                     //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the kernel type configured above to the testbed; without the explicit
+  // template argument the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+//
+// Exercises cuasr::max_mult on float through the SIMT op class for Sm50 with
+// row-major / column-major / column-major layout arguments.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): max_mult may need an element type (max_mult<precision>) -- confirm.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                       //
+    RingOp,                                                         //
+    precision, cutlass::layout::RowMajor,                           //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, OpClass, SmArch,                                     //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the kernel type configured above to the testbed; without the explicit
+  // template argument the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+//
+// Exercises cuasr::max_mult on float through the SIMT op class for Sm50 with
+// row-major / column-major / column-major layout arguments.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f32_srgemm_tn_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): max_mult may need an element type (max_mult<precision>) -- confirm.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                       //
+    RingOp,                                                         //
+    precision, cutlass::layout::RowMajor,                           //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, cutlass::layout::ColumnMajor,                        //
+    precision, OpClass, SmArch,                                     //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the kernel type configured above to the testbed; without the explicit
+  // template argument the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/simt_maximum_plus_ssrgemm_nt_t_sm50.cu b/test/device/sm50_simt_max_mult_f32_srgemm_tn_t.cu
similarity index 76%
rename from test/device/simt_maximum_plus_ssrgemm_nt_t_sm50.cu
rename to test/device/sm50_simt_max_mult_f32_srgemm_tn_t.cu
index b0110d6..a3f267a 100644
--- a/test/device/simt_maximum_plus_ssrgemm_nt_t_sm50.cu
+++ b/test/device/sm50_simt_max_mult_f32_srgemm_tn_t.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
-* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu).
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,27 +29,26 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -65,27 +64,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -101,27 +99,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -137,27 +134,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -173,27 +169,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -209,27 +204,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -245,27 +239,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -281,27 +274,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, 
OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -317,27 +309,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -353,27 +344,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -389,27 +379,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -425,27 +414,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // @@ -461,27 +449,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -497,27 +484,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -533,27 +519,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -569,27 +554,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // @@ -605,27 +589,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -641,27 +624,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -677,27 +659,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -713,27 +694,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -749,27 
+729,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -785,27 +764,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -821,27 +799,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -857,27 +834,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -893,27 +869,26 @@ 
TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -929,27 +904,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -965,27 +939,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1001,27 +974,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1036,28 +1008,27 @@ 
TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1073,27 +1044,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1109,27 +1079,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1145,27 +1114,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // @@ -1181,27 +1149,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1217,27 +1184,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1253,27 +1219,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1289,27 +1254,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // @@ -1325,27 +1289,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1361,27 +1324,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1397,27 +1359,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1433,27 +1394,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ 
-1469,27 +1429,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1505,27 +1464,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1541,27 +1499,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1576,28 +1533,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1613,27 +1569,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1649,27 +1604,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1685,27 +1639,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1721,27 +1674,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1757,27 +1709,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1793,27 +1744,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1829,27 +1779,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1865,27 +1814,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // @@ -1901,27 +1849,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1937,27 +1884,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1973,27 +1919,26 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2009,27 +1954,61 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -2045,27 +2024,61 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config 
= typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_tn_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // diff --git a/test/device/sm50_simt_max_mult_f32_srgemm_tt_n.cu b/test/device/sm50_simt_max_mult_f32_srgemm_tt_n.cu new file mode 100644 index 0000000..951cb5b --- /dev/null +++ b/test/device/sm50_simt_max_mult_f32_srgemm_tt_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 
+// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + 
using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using 
SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 64 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 32 x 128 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 64 x 32 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 64 x 64 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 2
// Threadblock: 64 x 128 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 128 x 32 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 2
// Threadblock: 128 x 64 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 16 x 64 x 16
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 16 x 128 x 16
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 4
// Threadblock: 32 x 32 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 32 x 64 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 32 x 128 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 32 x 256 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 4
// Threadblock: 64 x 64 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 64 x 128 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 4
// Threadblock: 64 x 256 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 8 x 4
// Warps / Block: 2 x 4
// Threadblock: 128 x 128 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 32 x 32 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 64 x 32 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 64 x 64 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 128 x 32 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 128 x 64 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 128 x 128 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 256 x 32 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 256 x 64 x 8
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 32 x 64 x 16
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 32 x 128 x 16
// Runs TestAllGemm on an SM50 SIMT float Srgemm (max_mult ring operator) built
// with this tiling.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_mult_f32_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): if cuasr::max_mult is a class template, it needs explicit
  // template arguments here (presumably <precision>) -- confirm.
  using RingOp           = cuasr::max_mult;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<                          //
      RingOp,                                                           //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::RowMajor,                             //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Name the kernel under test explicitly: the call passes no arguments from
  // which TestAllGemm could pick up Srgemm.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_tt_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_maximum_plus_ssrgemm_tt_t_sm50.cu b/test/device/sm50_simt_max_mult_f32_srgemm_tt_t.cu similarity index 76% rename from test/device/simt_maximum_plus_ssrgemm_tt_t_sm50.cu rename to test/device/sm50_simt_max_mult_f32_srgemm_tt_t.cu index 3fbaaa1..4e71eb6 100644 --- a/test/device/simt_maximum_plus_ssrgemm_tt_t_sm50.cu +++ b/test/device/sm50_simt_max_mult_f32_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = 
float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 
64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1036,26 +1008,25 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 
128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1114,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // 
Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { 
// Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1469,25 +1429,24 @@ 
TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ 
-1576,26 +1533,25 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1829,25 +1779,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; 
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f32_srgemm_tt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/sm50_simt_max_mult_f64_srgemm_nn_n.cu b/test/device/sm50_simt_max_mult_f64_srgemm_nn_n.cu new file mode 100644 index 0000000..eb3f21f --- /dev/null +++ b/test/device/sm50_simt_max_mult_f64_srgemm_nn_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::max_mult is named without template arguments in every
+  // test of this file; confirm that matches its declaration in cuasr/functional.h.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed so the Srgemm configured above is what
+  // gets exercised. NOTE(review): assumes TestAllGemm is templated on the kernel
+  // type, as in the CUTLASS testbed; confirm against testbed.h.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::max_mult is named without template arguments in every
+  // test of this file; confirm that matches its declaration in cuasr/functional.h.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed so the Srgemm configured above is what
+  // gets exercised. NOTE(review): assumes TestAllGemm is templated on the kernel
+  // type, as in the CUTLASS testbed; confirm against testbed.h.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::max_mult is named without template arguments in every
+  // test of this file; confirm that matches its declaration in cuasr/functional.h.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed so the Srgemm configured above is what
+  // gets exercised. NOTE(review): assumes TestAllGemm is templated on the kernel
+  // type, as in the CUTLASS testbed; confirm against testbed.h.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::max_mult is named without template arguments in every
+  // test of this file; confirm that matches its declaration in cuasr/functional.h.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed so the Srgemm configured above is what
+  // gets exercised. NOTE(review): assumes TestAllGemm is templated on the kernel
+  // type, as in the CUTLASS testbed; confirm against testbed.h.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::max_mult is named without template arguments in every
+  // test of this file; confirm that matches its declaration in cuasr/functional.h.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed so the Srgemm configured above is what
+  // gets exercised. NOTE(review): assumes TestAllGemm is templated on the kernel
+  // type, as in the CUTLASS testbed; confirm against testbed.h.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::max_mult is named without template arguments in every
+  // test of this file; confirm that matches its declaration in cuasr/functional.h.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed so the Srgemm configured above is what
+  // gets exercised. NOTE(review): assumes TestAllGemm is templated on the kernel
+  // type, as in the CUTLASS testbed; confirm against testbed.h.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::max_mult is named without template arguments in every
+  // test of this file; confirm that matches its declaration in cuasr/functional.h.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed so the Srgemm configured above is what
+  // gets exercised. NOTE(review): assumes TestAllGemm is templated on the kernel
+  // type, as in the CUTLASS testbed; confirm against testbed.h.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::max_mult is named without template arguments in every
+  // test of this file; confirm that matches its declaration in cuasr/functional.h.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed so the Srgemm configured above is what
+  // gets exercised. NOTE(review): assumes TestAllGemm is templated on the kernel
+  // type, as in the CUTLASS testbed; confirm against testbed.h.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::max_mult is named without template arguments in every
+  // test of this file; confirm that matches its declaration in cuasr/functional.h.
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed so the Srgemm configured above is what
+  // gets exercised. NOTE(review): assumes TestAllGemm is templated on the kernel
+  // type, as in the CUTLASS testbed; confirm against testbed.h.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+// Builds the f64 column-major SIMT Srgemm for this tiling and runs the testbed on it.
+#if defined(CUASR_TEST_LEVEL) && (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the testbed on the Srgemm configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_mult_f64_srgemm_nn_t.cu b/test/device/sm50_simt_max_mult_f64_srgemm_nn_t.cu
new file mode 100644
index 0000000..8c41c96
--- /dev/null
+++ b/test/device/sm50_simt_max_mult_f64_srgemm_nn_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for f64
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // exercise the Srgemm type declared above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  // f64 SIMT max_mult SRGEMM over the tiling described in the header comment.
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue functor taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm instantiation to the harness; otherwise the alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_max_mult_f64_srgemm_nt_n.cu b/test/device/sm50_simt_max_mult_f64_srgemm_nt_n.cu new file mode 100644 index 0000000..b831f57 --- /dev/null +++ b/test/device/sm50_simt_max_mult_f64_srgemm_nt_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_maximum_plus_dsrgemm_nt_t_sm50.cu b/test/device/sm50_simt_max_mult_f64_srgemm_nt_t.cu similarity index 70% rename from test/device/simt_maximum_plus_dsrgemm_nt_t_sm50.cu rename to test/device/sm50_simt_max_mult_f64_srgemm_nt_t.cu index 8b5e817..fef5cc7 100644 --- a/test/device/simt_maximum_plus_dsrgemm_nt_t_sm50.cu +++ b/test/device/sm50_simt_max_mult_f64_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; 
- using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { 
using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 
64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { 
+TEST(SM50_device_max_mult_f64_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 
32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; 
+ + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 
64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/sm50_simt_max_mult_f64_srgemm_tn_n.cu b/test/device/sm50_simt_max_mult_f64_srgemm_tn_n.cu new file mode 100644 index 0000000..88c4363 --- /dev/null +++ b/test/device/sm50_simt_max_mult_f64_srgemm_tn_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 
8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 
32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape 
= cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_max_mult_f64_srgemm_tn_t.cu b/test/device/sm50_simt_max_mult_f64_srgemm_tn_t.cu new file mode 100644 index 0000000..1fc1823 --- /dev/null +++ b/test/device/sm50_simt_max_mult_f64_srgemm_tn_t.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>; // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed needs the kernel type
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>; // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed needs the kernel type
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>; // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed needs the kernel type
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>; // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed needs the kernel type
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {  // SIMT f64 SRGEMM, one tile configuration
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator specialized for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type to the test driver
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_mult_f64_srgemm_tt_n.cu b/test/device/sm50_simt_max_mult_f64_srgemm_tt_n.cu
new file mode 100644
index 0000000..dd2eb20
--- /dev/null
+++ b/test/device/sm50_simt_max_mult_f64_srgemm_tt_n.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_mult_f64_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;  // f64: the one element type passed to every type slot below
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_mult<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue chosen by the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm takes no arguments, so the kernel type under test is named explicitly.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/simt_binary_or_binary_and_dsrgemm_tt_t_sm50.cu b/test/device/sm50_simt_max_mult_f64_srgemm_tt_t.cu
similarity index 70%
rename from test/device/simt_binary_or_binary_and_dsrgemm_tt_t_sm50.cu
rename to test/device/sm50_simt_max_mult_f64_srgemm_tt_t.cu
index 006cac2..96207f1 100644
--- a/test/device/simt_binary_or_binary_and_dsrgemm_tt_t_sm50.cu
+++ b/test/device/sm50_simt_max_mult_f64_srgemm_tt_t.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
-* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu).
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 
16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, 
cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 
// Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ 
TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 
32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 
+// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { 
+TEST(SM50_device_max_mult_f64_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { 
+TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_mult_f64_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/simt_maximum_minimum_ssrgemm_nn_n_sm50.cu b/test/device/sm50_simt_max_plus_f32_srgemm_nn_n.cu similarity index 76% rename from test/device/simt_maximum_minimum_ssrgemm_nn_n_sm50.cu rename to test/device/sm50_simt_max_plus_f32_srgemm_nn_n.cu index 0dbd857..fd8a37d 100644 --- a/test/device/simt_maximum_minimum_ssrgemm_nn_n_sm50.cu +++ b/test/device/sm50_simt_max_plus_f32_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; 
using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { 
+TEST(SM50_device_max_plus_f32_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 
32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 
8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, 
// precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch 
= cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1036,26 +1008,25 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 
16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 +1114,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 
32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1469,25 +1429,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; 
using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1576,26 +1533,25 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { 
+TEST(SM50_device_max_plus_f32_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1829,25 +1779,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 
64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_ssrgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_nn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // diff --git a/test/device/sm50_simt_max_plus_f32_srgemm_nn_t.cu b/test/device/sm50_simt_max_plus_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..9d9cee3 --- /dev/null +++ b/test/device/sm50_simt_max_plus_f32_srgemm_nn_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 2
//       Threadblock: 128 x 64 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 0.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 16 x 64 x 16
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 16 x 128 x 16
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 4
//       Threadblock: 32 x 32 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 32 x 64 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 32 x 128 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 32 x 256 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 4
//       Threadblock: 64 x 64 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 64 x 128 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 64 x 256 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 4
//       Threadblock: 128 x 128 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 0.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 2
//       Threadblock: 32 x 32 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 2
//       Threadblock: 64 x 32 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 2
//       Threadblock: 64 x 64 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 4 x 2
//       Threadblock: 128 x 32 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 2
//       Threadblock: 128 x 64 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 2
//       Threadblock: 128 x 128 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 4 x 2
//       Threadblock: 256 x 32 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 8 x 4
//     Warps / Block: 4 x 2
//       Threadblock: 256 x 64 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 4
//       Threadblock: 32 x 64 x 16
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 4
//       Threadblock: 32 x 128 x 16
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 8 x 4
//     Warps / Block: 4 x 4
//       Threadblock: 64 x 32 x 16
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 4
//       Threadblock: 64 x 64 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 4
//       Threadblock: 64 x 128 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 4 x 4
//       Threadblock: 64 x 256 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
//    Threads / Warp: 8 x 4
//     Warps / Block: 4 x 4
//       Threadblock: 128 x 32 x 16
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 8, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 4 x 4
//       Threadblock: 128 x 64 x 8
//
// Max-plus SRGEMM device test on f32 (SIMT, Sm50) for the tiling listed above.
// Built only when CUASR_TEST_LEVEL is defined and >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
  using precision = float;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // Ring operator instantiated on the element type.
  using RingOp           = cuasr::max_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  // The epilogue functor comes from the default semiring configuration.
  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  // Layouts are column-major, column-major, row-major (presumably A, B, C);
  // the trailing 2 is presumably the stage count -- confirm against Srgemm.
  using Srgemm = cuasr::gemm::device::Srgemm<                           //
      RingOp,                                                           //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::ColumnMajor,                          //
      precision, cutlass::layout::RowMajor,                             //
      precision, OpClass, SmArch,                                       //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Run the shared driver on the kernel type assembled above.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_nn_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_max_plus_f32_srgemm_nt_n.cu b/test/device/sm50_simt_max_plus_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..528fd8b --- /dev/null +++ b/test/device/sm50_simt_max_plus_f32_srgemm_nt_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 16 x 32 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 16 x 64 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 32 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 64 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 128 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 32 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 64 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 128 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 32 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 64 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 64 x 16
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 128 x 16
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 32 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 64 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 128 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 256 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 64 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 128 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 256 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 128 x 128 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 32 x 32 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 32 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 64 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 32 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 64 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 128 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 32 x 8
+//
+// SIMT (Sm50) max-plus f32 SRGEMM instantiated with ColumnMajor / RowMajor /
+// ColumnMajor layouts; compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): if cuasr::max_plus is a class template this alias needs its
+  // element type (e.g. max_plus<precision>) -- confirm against its definition.
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed; without the explicit
+  // template argument the Srgemm alias above would never be exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_nt_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_plus_f32_srgemm_nt_t.cu b/test/device/sm50_simt_max_plus_f32_srgemm_nt_t.cu
new file mode 100644
index 0000000..384a36d
--- /dev/null
+++ b/test/device/sm50_simt_max_plus_f32_srgemm_nt_t.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// 
Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 
64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; 
+ using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
+ + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only at test level >= 1
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)  // built only at test level >= 0
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only at test level >= 1
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only at test level >= 1
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only at test level >= 1
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only at test level >= 1
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only at test level >= 1
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only at test level >= 2
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)  // built only at test level >= 0
+TEST(SM50_device_max_plus_f32_srgemm_nt_t, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type under test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  // 1st matrix: column-major ("n" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 2nd matrix: row-major ("t" in nt_t)
+      precision, cutlass::layout::RowMajor,  // 3rd matrix: row-major ("_t" in nt_t)
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm named explicitly
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
diff --git a/test/device/sm50_simt_max_plus_f32_srgemm_tn_n.cu b/test/device/sm50_simt_max_plus_f32_srgemm_tn_n.cu
new file mode 100644
index 0000000..c80200a
--- /dev/null
+++ b/test/device/sm50_simt_max_plus_f32_srgemm_tn_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // NOTE(review): assumes max_plus is a template over the element type; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // NOTE(review): assumes max_plus is a template over the element type; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // NOTE(review): assumes max_plus is a template over the element type; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // NOTE(review): assumes max_plus is a template over the element type; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // NOTE(review): assumes max_plus is a template over the element type; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // NOTE(review): assumes max_plus is a template over the element type; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // NOTE(review): assumes max_plus is a template over the element type; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // NOTE(review): assumes max_plus is a template over the element type; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // NOTE(review): assumes max_plus is a template over the element type; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated on the Srgemm alias above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    8 x 4
+// Warps / Block:     2 x 2
+// Threadblock:       128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp:    8 x 4
+// Warps / Block:     2 x 4
+// Threadblock:       32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp:    8 x 4
+// Warps / Block:     2 x 4
+// Threadblock:       64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    8 x 4
+// Warps / Block:     2 x 4
+// Threadblock:       128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp:    8 x 4
+// Warps / Block:     4 x 2
+// Threadblock:       128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp:    8 x 4
+// Warps / Block:     4 x 2
+// Threadblock:       256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    8 x 4
+// Warps / Block:     4 x 2
+// Threadblock:       256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 4
+// Threadblock:       32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 4
+// Threadblock:       32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp:    8 x 4
+// Warps / Block:     4 x 4
+// Threadblock:       64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 4
+// Threadblock:       64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 4
+// Threadblock:       64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+  // Semiring under test, then the threadblock / warp / instruction tile shapes.
+  using RingOp           = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  // The epilogue functor is taken from the ring's default configuration.
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Kernel under test: operand layouts are row-, column-, column-major.
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the kernel type to the shared device-level test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_tn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_max_plus_f32_srgemm_tn_t.cu b/test/device/sm50_simt_max_plus_f32_srgemm_tn_t.cu new file mode 100644 index 0000000..d0154e5 --- /dev/null +++ b/test/device/sm50_simt_max_plus_f32_srgemm_tn_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// 
Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 
64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; 
+ using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
+ + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;  // (warps / block) x warp tile
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;  // (elements / thread) x (threads / warp)
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // harness is instantiated on the kernel type above
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;  // NOTE(review): if max_plus is a class template it needs an argument list (e.g. <precision>) - confirm against its declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;  // NOTE(review): if max_plus is a class template it needs an argument list (e.g. <precision>) - confirm against its declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;  // NOTE(review): if max_plus is a class template it needs an argument list (e.g. <precision>) - confirm against its declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;  // NOTE(review): if max_plus is a class template it needs an argument list (e.g. <precision>) - confirm against its declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;  // NOTE(review): if max_plus is a class template it needs an argument list (e.g. <precision>) - confirm against its declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_tn_t, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_plus_f32_srgemm_tt_n.cu b/test/device/sm50_simt_max_plus_f32_srgemm_tt_n.cu
new file mode 100644
index 0000000..b1f654f
--- /dev/null
+++ b/test/device/sm50_simt_max_plus_f32_srgemm_tt_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;  // NOTE(review): if max_plus is a class template it needs an argument list (e.g. <precision>) - confirm against its declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;  // NOTE(review): if max_plus is a class template it needs an argument list (e.g. <precision>) - confirm against its declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;  // NOTE(review): if max_plus is a class template it needs an argument list (e.g. <precision>) - confirm against its declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// 
Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 
64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; 
+ using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
+ + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f32_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  // Builds a float max-plus Srgemm for this tile shape (element layouts:
+  // row-major, row-major, column-major) and expects TestAllGemm to pass.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  // Builds a float max-plus Srgemm for this tile shape (element layouts:
+  // row-major, row-major, column-major) and expects TestAllGemm to pass.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  // Builds a float max-plus Srgemm for this tile shape (element layouts:
+  // row-major, row-major, column-major) and expects TestAllGemm to pass.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  // Builds a float max-plus Srgemm for this tile shape (element layouts:
+  // row-major, row-major, column-major) and expects TestAllGemm to pass.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  // Builds a float max-plus Srgemm for this tile shape (element layouts:
+  // row-major, row-major, column-major) and expects TestAllGemm to pass.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  // Builds a float max-plus Srgemm for this tile shape (element layouts:
+  // row-major, row-major, column-major) and expects TestAllGemm to pass.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f32_srgemm_tt_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
diff --git a/test/device/simt_minimum_maximum_ssrgemm_tt_t_sm50.cu b/test/device/sm50_simt_max_plus_f32_srgemm_tt_t.cu
similarity index 76%
rename from test/device/simt_minimum_maximum_ssrgemm_tt_t_sm50.cu
rename to test/device/sm50_simt_max_plus_f32_srgemm_tt_t.cu
index af191f7..3c796fc 100644
--- a/test/device/simt_minimum_maximum_ssrgemm_tt_t_sm50.cu
+++ b/test/device/sm50_simt_max_plus_f32_srgemm_tt_t.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
-* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu).
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { 
+TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // 
Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 
64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ 
-1036,26 +1008,25 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 
16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1114,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1469,25 +1429,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 
64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1576,26 +1533,25 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 
x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1829,25 +1779,24 @@ 
TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; 
using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_maximum_ssrgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::maximum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f32_srgemm_tt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/simt_maximum_minimum_dsrgemm_nn_n_sm50.cu b/test/device/sm50_simt_max_plus_f64_srgemm_nn_n.cu similarity index 70% rename from test/device/simt_maximum_minimum_dsrgemm_nn_n_sm50.cu rename to test/device/sm50_simt_max_plus_f64_srgemm_nn_n.cu index 5946f66..b0bea0f 100644 --- a/test/device/simt_maximum_minimum_dsrgemm_nn_n_sm50.cu +++ b/test/device/sm50_simt_max_plus_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 
16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 
8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 
32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // 
@@ -533,25 +519,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 
8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, 
// + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 
64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 
64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; 
+ using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_minimum_dsrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::max_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::minimum, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // diff --git 
a/test/device/sm50_simt_max_plus_f64_srgemm_nn_t.cu b/test/device/sm50_simt_max_plus_f64_srgemm_nn_t.cu new file mode 100644 index 0000000..ff232e6 --- /dev/null +++ b/test/device/sm50_simt_max_plus_f64_srgemm_nn_t.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 
8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // 
+ precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using 
RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_max_plus_f64_srgemm_nt_n.cu b/test/device/sm50_simt_max_plus_f64_srgemm_nt_n.cu new file mode 100644 index 0000000..bb70fba --- /dev/null +++ b/test/device/sm50_simt_max_plus_f64_srgemm_nt_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+// Double-precision max_plus SRGEMM test for the tile configuration above;
+// compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // NOTE(review): confirm cuasr::max_plus needs no template argument here.
+  using RingOp           = cuasr::max_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the harness on the kernel type configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::ColumnMajor,  // presumably C (suffix "_n")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_plus_f64_srgemm_nt_t.cu b/test/device/sm50_simt_max_plus_f64_srgemm_nt_t.cu
new file mode 100644
index 0000000..f95a20b
--- /dev/null
+++ b/test/device/sm50_simt_max_plus_f64_srgemm_nt_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // type exercised by the testbed
+      RingOp,                                   // semiring operator
+      precision, cutlass::layout::ColumnMajor,  // presumably A ("n" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably B ("t" of nt)
+      precision, cutlass::layout::RowMajor,     // presumably C (suffix "_t")
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 1.
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 0.
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 1.
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 1.
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 1.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 1.
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 0.
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 1.
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Max-plus SRGEMM on doubles (SIMT, Sm50); built when CUASR_TEST_LEVEL >= 2.
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for the element type used by every operand.
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly so the harness exercises the Srgemm built above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_plus_f64_srgemm_tn_n.cu b/test/device/sm50_simt_max_plus_f64_srgemm_tn_n.cu
new file mode 100644
index 0000000..caf6a18
--- /dev/null
+++ b/test/device/sm50_simt_max_plus_f64_srgemm_tn_n.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // explicit: no argument to deduce Srgemm from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_max_plus_f64_srgemm_tn_t.cu b/test/device/sm50_simt_max_plus_f64_srgemm_tn_t.cu new file mode 100644 index 0000000..2dbade9 --- /dev/null +++ b/test/device/sm50_simt_max_plus_f64_srgemm_tn_t.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the kernel aliased above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_plus_f64_srgemm_tt_n.cu b/test/device/sm50_simt_max_plus_f64_srgemm_tt_n.cu
new file mode 100644
index 0000000..b619c4d
--- /dev/null
+++ b/test/device/sm50_simt_max_plus_f64_srgemm_tt_n.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>; // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+// Max-plus SRGEMM over double (OpClassSimt, Sm50 tag); built when CUASR_TEST_LEVEL >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
diff --git a/test/device/sm50_simt_max_plus_f64_srgemm_tt_t.cu b/test/device/sm50_simt_max_plus_f64_srgemm_tt_t.cu
new file mode 100644
index 0000000..69f021d
--- /dev/null
+++ b/test/device/sm50_simt_max_plus_f64_srgemm_tt_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_max_plus_f64_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_max_plus_f64_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::max_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // run the testbed on the Srgemm configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// 
Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + 
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
+TEST(SM50_device_max_plus_f64_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using 
precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// 
Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_max_plus_f64_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::max_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_max_f32_srgemm_nn_n.cu b/test/device/sm50_simt_min_max_f32_srgemm_nn_n.cu new file mode 100644 index 0000000..9f0695f --- /dev/null +++ b/test/device/sm50_simt_min_max_f32_srgemm_nn_n.cu @@ -0,0 
+1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 
8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 4
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 2 x 2
+//      Threadblock: 64 x 64 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 8
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 2 x 2
+//      Threadblock: 64 x 128 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 4
+//   Threads / Warp: 8 x 4
+//    Warps / Block: 2 x 2
+//      Threadblock: 128 x 32 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 8
+//   Threads / Warp: 8 x 4
+//    Warps / Block: 2 x 2
+//      Threadblock: 128 x 64 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 2 x 2
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 2 x 4
+//      Threadblock: 16 x 64 x 16
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 2 x 4
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 2 x 4
+//      Threadblock: 16 x 128 x 16
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 2 x 2
+//   Threads / Warp: 8 x 4
+//    Warps / Block: 2 x 4
+//      Threadblock: 32 x 32 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 4 x 2
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 2 x 4
+//      Threadblock: 32 x 64 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 4 x 4
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 2 x 4
+//      Threadblock: 32 x 128 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 4 x 8
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 2 x 4
+//      Threadblock: 32 x 256 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 4 x 4
+//   Threads / Warp: 8 x 4
+//    Warps / Block: 2 x 4
+//      Threadblock: 64 x 64 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 4
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 2 x 4
+//      Threadblock: 64 x 128 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 8
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 2 x 4
+//      Threadblock: 64 x 256 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 8
+//   Threads / Warp: 8 x 4
+//    Warps / Block: 2 x 4
+//      Threadblock: 128 x 128 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 2 x 2
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 4 x 2
+//      Threadblock: 32 x 32 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 4 x 2
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 4 x 2
+//      Threadblock: 64 x 32 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 4 x 4
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 4 x 2
+//      Threadblock: 64 x 64 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 4 x 4
+//   Threads / Warp: 8 x 4
+//    Warps / Block: 4 x 2
+//      Threadblock: 128 x 32 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 4
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 4 x 2
+//      Threadblock: 128 x 64 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 8
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 4 x 2
+//      Threadblock: 128 x 128 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 4
+//   Threads / Warp: 8 x 4
+//    Warps / Block: 4 x 2
+//      Threadblock: 256 x 32 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 8 x 8
+//   Threads / Warp: 8 x 4
+//    Warps / Block: 4 x 2
+//      Threadblock: 256 x 64 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 2 x 2
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 4 x 4
+//      Threadblock: 32 x 64 x 16
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 2 x 4
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 4 x 4
+//      Threadblock: 32 x 128 x 16
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 2 x 2
+//   Threads / Warp: 8 x 4
+//    Warps / Block: 4 x 4
+//      Threadblock: 64 x 32 x 16
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 4 x 2
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 4 x 4
+//      Threadblock: 64 x 64 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// min_max semiring SRGEMM, f32, SIMT op class, built with the Sm50 arch tag.
+// Elements / Thread: 4 x 4
+//   Threads / Warp: 4 x 8
+//    Warps / Block: 4 x 4
+//      Threadblock: 64 x 128 x 8
+// Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  // Instantiate the ring operator for this test's element type.
+  using RingOp           = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type to the testbed explicitly; it cannot be deduced.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_max_f32_srgemm_nn_t.cu b/test/device/sm50_simt_min_max_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..1f54fcd --- /dev/null +++ b/test/device/sm50_simt_min_max_f32_srgemm_nn_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // drive the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_nn_t, 256x128x8_64x32x1_8x8_8x4_4x4) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_max_f32_srgemm_nt_n.cu b/test/device/sm50_simt_min_max_f32_srgemm_nt_n.cu
new file mode 100644
index 0000000..f890981
--- /dev/null
+++ b/test/device/sm50_simt_min_max_f32_srgemm_nt_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // float min_max SRGEMM case for the tiling listed above
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max; // NOTE(review): a template argument (e.g. <precision>) may have been lost here — confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // instantiate the test driver for the Srgemm type built above
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nt_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_max_f32_srgemm_nt_t.cu b/test/device/sm50_simt_min_max_f32_srgemm_nt_t.cu new file mode 100644 index 0000000..c006ae3 --- /dev/null +++ b/test/device/sm50_simt_min_max_f32_srgemm_nt_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 
x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 8 x 32 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 32 x 128 threadblock tile)
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4    (x elements/thread -> 16 x 8 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 64 x 32 threadblock tile)
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8    (x elements/thread -> 16 x 16 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 64 x 64 threadblock tile)
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 16 x 32 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 64 x 128 threadblock tile)
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8    (x elements/thread -> 16 x 64 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 64 x 256 threadblock tile)
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4    (x elements/thread -> 32 x 8 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 128 x 32 threadblock tile)
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4    (x elements/thread -> 32 x 16 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 128 x 64 threadblock tile)
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 32 x 32 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 128 x 128 threadblock tile)
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8    (x elements/thread -> 32 x 64 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 128 x 256 threadblock tile)
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4    (x elements/thread -> 64 x 16 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 256 x 64 threadblock tile)
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4    (x elements/thread -> 64 x 32 warp tile)
+// Warps / Block: 4 x 4    (x warp tile -> 256 x 128 threadblock tile)
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_nt_t, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_max_f32_srgemm_tn_n.cu b/test/device/sm50_simt_min_max_f32_srgemm_tn_n.cu
new file mode 100644
index 0000000..45c514c
--- /dev/null
+++ b/test/device/sm50_simt_min_max_f32_srgemm_tn_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 8 x 32 warp tile)
+// Warps / Block: 1 x 1    (x warp tile -> 8 x 32 threadblock tile)
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 16 x 32 warp tile)
+// Warps / Block: 1 x 1    (x warp tile -> 16 x 32 threadblock tile)
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8    (x elements/thread -> 16 x 64 warp tile)
+// Warps / Block: 1 x 1    (x warp tile -> 16 x 64 threadblock tile)
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 32 x 32 warp tile)
+// Warps / Block: 1 x 1    (x warp tile -> 32 x 32 threadblock tile)
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8    (x elements/thread -> 32 x 64 warp tile)
+// Warps / Block: 1 x 1    (x warp tile -> 32 x 64 threadblock tile)
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4    (x elements/thread -> 64 x 32 warp tile)
+// Warps / Block: 1 x 1    (x warp tile -> 64 x 32 threadblock tile)
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8    (x elements/thread -> 8 x 16 warp tile)
+// Warps / Block: 1 x 2    (x warp tile -> 8 x 32 threadblock tile)
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 8 x 32 warp tile)
+// Warps / Block: 1 x 2    (x warp tile -> 8 x 64 threadblock tile)
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8    (x elements/thread -> 16 x 16 warp tile)
+// Warps / Block: 1 x 2    (x warp tile -> 16 x 32 threadblock tile)
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 16 x 32 warp tile)
+// Warps / Block: 1 x 2    (x warp tile -> 16 x 64 threadblock tile)
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8    (x elements/thread -> 16 x 64 warp tile)
+// Warps / Block: 1 x 2    (x warp tile -> 16 x 128 threadblock tile)
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4    (x elements/thread -> 32 x 16 warp tile)
+// Warps / Block: 1 x 2    (x warp tile -> 32 x 32 threadblock tile)
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 32 x 32 warp tile)
+// Warps / Block: 1 x 2    (x warp tile -> 32 x 64 threadblock tile)
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8    (x elements/thread -> 32 x 64 warp tile)
+// Warps / Block: 1 x 2    (x warp tile -> 32 x 128 threadblock tile)
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4    (x elements/thread -> 64 x 32 warp tile)
+// Warps / Block: 1 x 2    (x warp tile -> 64 x 64 threadblock tile)
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8    (x elements/thread -> 16 x 32 warp tile)
+// Warps / Block: 2 x 1    (x warp tile -> 32 x 32 threadblock tile)
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;  // element type used for every operand slot
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): may need <precision> if min_max is a template - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // output op taken from Config
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // test the kernel type configured above
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_tn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_max_f32_srgemm_tn_t.cu b/test/device/sm50_simt_min_max_f32_srgemm_tn_t.cu new file mode 100644 index 0000000..44d9351 --- /dev/null +++ b/test/device/sm50_simt_min_max_f32_srgemm_tn_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // no call args, so Srgemm cannot be deduced
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // no call args, so Srgemm cannot be deduced
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // no call args, so Srgemm cannot be deduced
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // no call args, so Srgemm cannot be deduced
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Srgemm is passed explicitly; the call has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // B: "n" (column-major)
+      precision, cutlass::layout::RowMajor, // C: "t" (row-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tn_t, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // B: "n" (column-major)
+      precision, cutlass::layout::RowMajor, // C: "t" (row-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_max_f32_srgemm_tt_n.cu b/test/device/sm50_simt_min_max_f32_srgemm_tt_n.cu
new file mode 100644
index 0000000..b48caf4
--- /dev/null
+++ b/test/device/sm50_simt_min_max_f32_srgemm_tt_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, // A: "t" (row-major)
+      precision, cutlass::layout::RowMajor, // B: "t" (row-major)
+      precision, cutlass::layout::ColumnMajor, // C: "n" (column-major)
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape =
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+// Float SIMT min_max Srgemm at this tile shape, checked via TestAllGemm.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): add <precision> here if cuasr::min_max is a class template.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; otherwise the Srgemm alias above is unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): an explicit <precision> argument may be missing here - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): an explicit <precision> argument may be missing here - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): an explicit <precision> argument may be missing here - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): an explicit <precision> argument may be missing here - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tt_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_max_f32_srgemm_tt_t.cu b/test/device/sm50_simt_min_max_f32_srgemm_tt_t.cu
new file mode 100644
index 0000000..712a4e5
--- /dev/null
+++ b/test/device/sm50_simt_min_max_f32_srgemm_tt_t.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;  // NOTE(review): an explicit <precision> argument may be missing here - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision =
float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 
128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
+TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision 
= float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f32_srgemm_tt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_maximum_multiplies_dsrgemm_nn_n_sm50.cu 
b/test/device/sm50_simt_min_max_f64_srgemm_nn_n.cu similarity index 70% rename from test/device/simt_maximum_multiplies_dsrgemm_nn_n_sm50.cu rename to test/device/sm50_simt_min_max_f64_srgemm_nn_n.cu index 9412254..994a5ae 100644 --- a/test/device/simt_maximum_multiplies_dsrgemm_nn_n_sm50.cu +++ b/test/device/sm50_simt_min_max_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 
16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 
2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ 
TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm 
= cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 
64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f64_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_min_max_f64_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_max; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // diff --git a/test/device/sm50_simt_min_max_f64_srgemm_nn_t.cu b/test/device/sm50_simt_min_max_f64_srgemm_nn_t.cu new 
file mode 100644 index 0000000..9ac3269 --- /dev/null +++ b/test/device/sm50_simt_min_max_f64_srgemm_nn_t.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + 
using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+//
+// Configures an f64 min_max Srgemm (ColumnMajor, ColumnMajor, RowMajor operand
+// layouts) with this tiling and checks it with the TestAllGemm harness.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): may need an explicit argument list (e.g. <precision>) -- confirm.
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the Srgemm type configured above to the harness.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_max_f64_srgemm_nt_n.cu b/test/device/sm50_simt_min_max_f64_srgemm_nt_n.cu new file mode 100644 index 0000000..d2541a9 --- /dev/null +++ b/test/device/sm50_simt_min_max_f64_srgemm_nt_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_max_f64_srgemm_nt_t.cu b/test/device/sm50_simt_min_max_f64_srgemm_nt_t.cu
new file mode 100644
index 0000000..79c0df9
--- /dev/null
+++ b/test/device/sm50_simt_min_max_f64_srgemm_nt_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;  // instantiated for this test's precision
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // kernel type passed explicitly
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device-level Srgemm test: min_max ring op over double, OpClassSimt, Sm50 tag.
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly: a call with no arguments cannot deduce it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8  (warp tile 32 x 32)
+// Warps / Block: 2 x 4  (block tile 64 x 128)
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL >= 1
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8  (warp tile 8 x 16)
+// Warps / Block: 4 x 2  (block tile 32 x 32)
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8  (warp tile 16 x 16)
+// Warps / Block: 4 x 2  (block tile 64 x 32)
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8  (warp tile 16 x 32)
+// Warps / Block: 4 x 2  (block tile 64 x 64)
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4  (warp tile 32 x 16)
+// Warps / Block: 4 x 2  (block tile 128 x 32)
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8  (warp tile 32 x 32)
+// Warps / Block: 4 x 2  (block tile 128 x 64)
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)  // built only when CUASR_TEST_LEVEL >= 0
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4  (warp tile 64 x 16)
+// Warps / Block: 4 x 2  (block tile 256 x 32)
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL >= 1
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8  (warp tile 8 x 16)
+// Warps / Block: 4 x 4  (block tile 32 x 64)
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8  (warp tile 8 x 32)
+// Warps / Block: 4 x 4  (block tile 32 x 128)
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4  (warp tile 16 x 8)
+// Warps / Block: 4 x 4  (block tile 64 x 32)
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8  (warp tile 16 x 16)
+// Warps / Block: 4 x 4  (block tile 64 x 64)
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8  (warp tile 16 x 32)
+// Warps / Block: 4 x 4  (block tile 64 x 128)
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4  (warp tile 32 x 8)
+// Warps / Block: 4 x 4  (block tile 128 x 32)
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4  (warp tile 32 x 16)
+// Warps / Block: 4 x 4  (block tile 128 x 64)
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL >= 1
+TEST(SM50_device_min_max_f64_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_max_f64_srgemm_tn_n.cu b/test/device/sm50_simt_min_max_f64_srgemm_tn_n.cu
new file mode 100644
index 0000000..8762326
--- /dev/null
+++ b/test/device/sm50_simt_min_max_f64_srgemm_tn_n.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8  (warp tile 8 x 32)
+// Warps / Block: 1 x 1  (block tile 8 x 32)
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL >= 1
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8  (warp tile 16 x 32)
+// Warps / Block: 1 x 1  (block tile 16 x 32)
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL >= 1
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8  (warp tile 16 x 64)
+// Warps / Block: 1 x 1  (block tile 16 x 64)
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL >= 1
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8  (warp tile 32 x 32)
+// Warps / Block: 1 x 1  (block tile 32 x 32)
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)  // built only when CUASR_TEST_LEVEL >= 0
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8  (warp tile 8 x 16)
+// Warps / Block: 1 x 2  (block tile 8 x 32)
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8  (warp tile 8 x 32)
+// Warps / Block: 1 x 2  (block tile 8 x 64)
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL >= 1
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8  (warp tile 16 x 16)
+// Warps / Block: 1 x 2  (block tile 16 x 32)
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8  (warp tile 16 x 32)
+// Warps / Block: 1 x 2  (block tile 16 x 64)
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL >= 2
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8  (warp tile 16 x 64)
+// Warps / Block: 1 x 2  (block tile 16 x 128)
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL >= 1
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4  (warp tile 32 x 16)
+// Warps / Block: 1 x 2  (block tile 32 x 32)
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL >= 1
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) {  // f64 min_max SRGEMM, SIMT op class, Sm50 tag
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue op taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                         //
+    RingOp,                                                           //
+    precision, cutlass::layout::RowMajor,                             //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, cutlass::layout::ColumnMajor,                          //
+    precision, OpClass, SmArch,                                       //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the kernel type explicitly; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+// Built only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+// Built whenever CUASR_TEST_LEVEL is defined and >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+// Built only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+// Built only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+// Built whenever CUASR_TEST_LEVEL is defined and >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+// Built only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+// Built only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // The semiring functor is instantiated for the element type.
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type configured above; otherwise Srgemm is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_max_f64_srgemm_tn_t.cu b/test/device/sm50_simt_min_max_f64_srgemm_tn_t.cu new file mode 100644 index 0000000..7e908ea --- /dev/null +++ b/test/device/sm50_simt_min_max_f64_srgemm_tn_t.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,                                   // ring operator (min_max)
+      precision, cutlass::layout::RowMajor,     // 't' of tn_t
+      precision, cutlass::layout::ColumnMajor,  // 'n' of tn_t
+      precision, cutlass::layout::RowMajor,     // '_t' of tn_t
+      precision, OpClass, SmArch,               //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_max_f64_srgemm_tt_n.cu b/test/device/sm50_simt_min_max_f64_srgemm_tt_n.cu
new file mode 100644
index 0000000..7af8682
--- /dev/null
+++ b/test/device/sm50_simt_min_max_f64_srgemm_tt_n.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+  // Device-level SRGEMM type under test; layouts in order: row, row, column (tt_n).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // TestAllGemm is given the type under test as an explicit template argument.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_max_f64_srgemm_tt_t.cu b/test/device/sm50_simt_min_max_f64_srgemm_tt_t.cu new file mode 100644 index 0000000..562ed50 --- /dev/null +++ b/test/device/sm50_simt_min_max_f64_srgemm_tt_t.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_max_f64_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_max<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 
64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
+TEST(SM50_device_min_max_f64_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision 
= double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision,
precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +//
Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp =
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and
(CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // +
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_max_f64_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_max<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + diff --git a/test/device/sm50_simt_min_mult_f32_srgemm_nn_n.cu b/test/device/sm50_simt_min_mult_f32_srgemm_nn_n.cu new file mode 100644 index 0000000..facc975 --- /dev/null +++ b/test/device/sm50_simt_min_mult_f32_srgemm_nn_n.cu @@ -0,0 +1,2090 @@
+/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass,
SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp =
cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape,
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64,
32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // +
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape =
cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + +
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape =
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult<precision>; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif +
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+// float min_mult Srgemm, ColumnMajor for all three layouts, OpClassSimt / Sm50.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): presumably min_mult<precision> -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel under test explicitly; the call itself passes no arguments.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_nn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_mult_f32_srgemm_nn_t.cu b/test/device/sm50_simt_min_mult_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..d954691 --- /dev/null +++ b/test/device/sm50_simt_min_mult_f32_srgemm_nn_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+//
+// Checks cuasr::test::gemm::device::TestAllGemm on the float min_mult SIMT
+// Srgemm built from the tile shapes below.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Ring operator instantiated for the element type under test.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; TestAllGemm has no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_nn_t, 256x128x8_64x32x1_8x8_8x4_4x4) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
diff --git a/test/device/sm50_simt_min_mult_f32_srgemm_nt_n.cu b/test/device/sm50_simt_min_mult_f32_srgemm_nt_n.cu
new file mode 100644
index 0000000..642aefe
--- /dev/null
+++ b/test/device/sm50_simt_min_mult_f32_srgemm_nt_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {  // float min-mult SIMT SRGEMM, verified via TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> assumed; confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  //
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the driver on this Srgemm
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 2
+// Threadblock:       64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 2
+// Threadblock:       64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp:    8 x 4
+// Warps / Block:     2 x 2
+// Threadblock:       128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    8 x 4
+// Warps / Block:     2 x 2
+// Threadblock:       128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp:    8 x 4
+// Warps / Block:     2 x 4
+// Threadblock:       32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp:    8 x 4
+// Warps / Block:     2 x 4
+// Threadblock:       64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    4 x 8
+// Warps / Block:     2 x 4
+// Threadblock:       64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    8 x 4
+// Warps / Block:     2 x 4
+// Threadblock:       128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp:    8 x 4
+// Warps / Block:     4 x 2
+// Threadblock:       128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 2
+// Threadblock:       128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp:    8 x 4
+// Warps / Block:     4 x 2
+// Threadblock:       256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp:    8 x 4
+// Warps / Block:     4 x 2
+// Threadblock:       256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 4
+// Threadblock:       32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp:    4 x 8
+// Warps / Block:     4 x 4
+// Threadblock:       32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp           = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Instantiate the test driver on the Srgemm type assembled above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_nt_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_mult_f32_srgemm_nt_t.cu b/test/device/sm50_simt_min_mult_f32_srgemm_nt_t.cu
new file mode 100644
index 0000000..7c88cee
--- /dev/null
+++ b/test/device/sm50_simt_min_mult_f32_srgemm_nt_t.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator plus threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Hand the Srgemm type configured above to the testbed.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// 
Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 
64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; 
+ using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
+ + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_nt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_mult_f32_srgemm_tn_n.cu b/test/device/sm50_simt_min_mult_f32_srgemm_tn_n.cu new file mode 100644 index 0000000..ca28251 --- /dev/null +++ b/test/device/sm50_simt_min_mult_f32_srgemm_tn_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+// f32 SIMT (Sm50) min_mult SRGEMM case; compiled when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): element-type template argument restored (it was lost to
+  // markup stripping); confirm against the cuasr::min_mult declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // The kernel type must be named explicitly; nothing in the call deduces it.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_tn_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_mult_f32_srgemm_tn_t.cu b/test/device/sm50_simt_min_mult_f32_srgemm_tn_t.cu
new file mode 100644
index 0000000..d385aa3
--- /dev/null
+++ b/test/device/sm50_simt_min_mult_f32_srgemm_tn_t.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+//
Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): likely min_mult<precision>; confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the kernel type built above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+ using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
+ + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  // Element type plus the op-class / architecture tags used by the aliases below.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): `<precision>` restored on the assumption that cuasr::min_mult
+  // is a class template over the element type -- confirm against its declaration.
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the configured kernel type to the testbed helper; the test passes only
+  // if it returns true.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_tn_t, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_mult_f32_srgemm_tt_n.cu b/test/device/sm50_simt_min_mult_f32_srgemm_tt_n.cu
new file mode 100644
index 0000000..7cf9d37
--- /dev/null
+++ b/test/device/sm50_simt_min_mult_f32_srgemm_tt_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f32_srgemm_tt_n, 
64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; 
+ using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
+ + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_tt_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_mult_f32_srgemm_tt_t.cu b/test/device/sm50_simt_min_mult_f32_srgemm_tt_t.cu new file mode 100644 index 0000000..67fc5a1 --- /dev/null +++ b/test/device/sm50_simt_min_mult_f32_srgemm_tt_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// 
Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + 
using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using 
SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f32_srgemm_tt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + diff --git a/test/device/sm50_simt_min_mult_f64_srgemm_nn_n.cu 
b/test/device/sm50_simt_min_mult_f64_srgemm_nn_n.cu new file mode 100644 index 0000000..681d229 --- /dev/null +++ b/test/device/sm50_simt_min_mult_f64_srgemm_nn_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + 
using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // 
+ precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; 
+ using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, 
SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using 
WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_mult_f64_srgemm_nn_t.cu b/test/device/sm50_simt_min_mult_f64_srgemm_nn_t.cu
new file mode 100644
index 0000000..9555610
--- /dev/null
+++ b/test/device/sm50_simt_min_mult_f64_srgemm_nn_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): confirm min_mult needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; // first two extents = warp tile x warps/block
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; // first two extents = elements/thread x threads/warp
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // name the kernel under test; the alias was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 1
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult; // NOTE(review): class-template args (presumably <precision>) may be missing - confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Check the Srgemm instantiation configured above.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_mult_f64_srgemm_nt_n.cu b/test/device/sm50_simt_min_mult_f64_srgemm_nt_n.cu new file mode 100644 index 0000000..b07acc7 --- /dev/null +++ b/test/device/sm50_simt_min_mult_f64_srgemm_nt_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 1
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 0.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 0.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 1.
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Double-precision cuasr::min_mult SRGEMM on the SIMT path for Sm50, checked
+// with TestAllGemm. Compiled only when CUASR_TEST_LEVEL is defined and >= 2.
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Name the kernel explicitly; the call has no argument to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_mult_f64_srgemm_nt_t.cu b/test/device/sm50_simt_min_mult_f64_srgemm_nt_t.cu
new file mode 100644
index 0000000..fc03dfa
--- /dev/null
+++ b/test/device/sm50_simt_min_mult_f64_srgemm_nt_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>; // ring operator instantiated on the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // checks the Srgemm type configured above
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_mult_f64_srgemm_tn_n.cu b/test/device/sm50_simt_min_mult_f64_srgemm_tn_n.cu new file mode 100644 index 0000000..898dadc --- /dev/null +++ b/test/device/sm50_simt_min_mult_f64_srgemm_tn_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 2 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 2
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 2
+//       Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 4 x 4
+//       Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 4 x 4
+//       Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_n
+      precision, cutlass::layout::ColumnMajor,   // layout for the trailing "_n"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
diff --git a/test/device/sm50_simt_min_mult_f64_srgemm_tn_t.cu b/test/device/sm50_simt_min_mult_f64_srgemm_tn_t.cu
new file mode 100644
index 0000000..e4d009a
--- /dev/null
+++ b/test/device/sm50_simt_min_mult_f64_srgemm_tn_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 1
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+//    Threads / Warp: 4 x 8
+//     Warps / Block: 1 x 2
+//       Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+//    Threads / Warp: 8 x 4
+//     Warps / Block: 1 x 2
+//       Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass   = cutlass::arch::OpClassSimt;
+  using SmArch    = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult;  // NOTE(review): confirm min_mult needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<    // kernel type handed to TestAllGemm below
+      RingOp,                                    //
+      precision, cutlass::layout::RowMajor,      // layout for the "t" in tn_t
+      precision, cutlass::layout::ColumnMajor,   // layout for the "n" in tn_t
+      precision, cutlass::layout::RowMajor,      // layout for the trailing "_t"
+      precision, OpClass, SmArch,                //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_mult_f64_srgemm_tt_n.cu b/test/device/sm50_simt_min_mult_f64_srgemm_tt_n.cu
new file mode 100644
index 0000000..4187923
--- /dev/null
+++ b/test/device/sm50_simt_min_mult_f64_srgemm_tt_n.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_mult<precision>;  // NOTE(review): <precision> restored - confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // hand the kernel type defined above to the testbed
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_minimum_plus_dsrgemm_tt_t_sm50.cu b/test/device/sm50_simt_min_mult_f64_srgemm_tt_t.cu similarity index 70% rename from test/device/simt_minimum_plus_dsrgemm_tt_t_sm50.cu rename to test/device/sm50_simt_min_mult_f64_srgemm_tt_t.cu index c452881..259a9b2 100644 --- a/test/device/simt_minimum_plus_dsrgemm_tt_t_sm50.cu +++ b/test/device/sm50_simt_min_mult_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape 
= cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 
32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
+ precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / 
Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using 
EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / 
Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision 
= double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_mult_f64_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/sm50_simt_min_plus_f32_srgemm_nn_n.cu b/test/device/sm50_simt_min_plus_f32_srgemm_nn_n.cu new file mode 100644 index 0000000..873608a --- /dev/null +++ b/test/device/sm50_simt_min_plus_f32_srgemm_nn_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
// NOTE(review): RingOp is spelled cuasr::min_plus<precision> and the harness is
// invoked as TestAllGemm<Srgemm>() on the assumption that both are templates
// (element type / kernel under test) -- confirm against the cuasr headers.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 8 x 4
//     Warps / Block: 1 x 1
//       Threadblock: 64 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 8 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 8 x 64 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

// NOTE(review): RingOp is spelled cuasr::min_plus<precision> and the harness is
// invoked as TestAllGemm<Srgemm>() on the assumption that both are templates
// (element type / kernel under test) -- confirm against the cuasr headers.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 16 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 16 x 64 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 16 x 128 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

// NOTE(review): RingOp is spelled cuasr::min_plus<precision> and the harness is
// invoked as TestAllGemm<Srgemm>() on the assumption that both are templates
// (element type / kernel under test) -- confirm against the cuasr headers.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 1 x 2
//       Threadblock: 32 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 32 x 64 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 1 x 2
//       Threadblock: 32 x 128 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

// NOTE(review): RingOp is spelled cuasr::min_plus<precision> and the harness is
// invoked as TestAllGemm<Srgemm>() on the assumption that both are templates
// (element type / kernel under test) -- confirm against the cuasr headers.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 8 x 4
//     Warps / Block: 1 x 2
//       Threadblock: 64 x 64 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 0.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 1
//       Threadblock: 32 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 1
//       Threadblock: 64 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

// NOTE(review): RingOp is spelled cuasr::min_plus<precision> and the harness is
// invoked as TestAllGemm<Srgemm>() on the assumption that both are templates
// (element type / kernel under test) -- confirm against the cuasr headers.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 1
//       Threadblock: 64 x 64 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 1
//       Threadblock: 128 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 16 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

// NOTE(review): RingOp is spelled cuasr::min_plus<precision> and the harness is
// invoked as TestAllGemm<Srgemm>() on the assumption that both are templates
// (element type / kernel under test) -- confirm against the cuasr headers.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 16 x 64 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 32 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 32 x 64 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

// NOTE(review): RingOp is spelled cuasr::min_plus<precision> and the harness is
// invoked as TestAllGemm<Srgemm>() on the assumption that both are templates
// (element type / kernel under test) -- confirm against the cuasr headers.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 32 x 128 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 2
//       Threadblock: 64 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 64 x 64 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

// NOTE(review): RingOp is spelled cuasr::min_plus<precision> and the harness is
// invoked as TestAllGemm<Srgemm>() on the assumption that both are templates
// (element type / kernel under test) -- confirm against the cuasr headers.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 2
//       Threadblock: 64 x 128 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 2
//       Threadblock: 128 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 8
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 2
//       Threadblock: 128 x 64 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 0.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

// NOTE(review): RingOp is spelled cuasr::min_plus<precision> and the harness is
// invoked as TestAllGemm<Srgemm>() on the assumption that both are templates
// (element type / kernel under test) -- confirm against the cuasr headers.

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 16 x 64 x 16
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
//    Threads / Warp: 4 x 8
//     Warps / Block: 2 x 4
//       Threadblock: 16 x 128 x 16
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
//    Threads / Warp: 8 x 4
//     Warps / Block: 2 x 4
//       Threadblock: 32 x 32 x 8
// f32 min-plus SRGEMM with all three layouts ColumnMajor (SIMT, Sm50);
// compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
  using precision = float;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::ColumnMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type explicitly so the harness exercises this configuration.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_nn_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed is instantiated on the Srgemm type above
+}
+#endif
+
diff --git
a/test/device/sm50_simt_min_plus_f32_srgemm_nn_t.cu b/test/device/sm50_simt_min_plus_f32_srgemm_nn_t.cu
new file mode 100644
index 0000000..7381401
--- /dev/null
+++ b/test/device/sm50_simt_min_plus_f32_srgemm_nn_t.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // Semiring under test, instantiated for the f32 element type.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Kernel type is named explicitly; TestAllGemm takes no arguments to deduce it from.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_nn_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_plus_f32_srgemm_nt_n.cu b/test/device/sm50_simt_min_plus_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..cfb39b5 --- /dev/null +++ b/test/device/sm50_simt_min_plus_f32_srgemm_nt_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_nt_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_plus_f32_srgemm_nt_t.cu b/test/device/sm50_simt_min_plus_f32_srgemm_nt_t.cu new file mode 100644 index 0000000..151dc2d --- /dev/null +++ b/test/device/sm50_simt_min_plus_f32_srgemm_nt_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): no template arguments here (same in every test below) - confirm cuasr::min_plus is not a template
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): no template arguments here (same in every test below) - confirm cuasr::min_plus is not a template
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): no template arguments here (same in every test below) - confirm cuasr::min_plus is not a template
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // kernel under test
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_nt_t, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  // Semiring operator and threadblock / warp / instruction tile shapes.
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Srgemm is the kernel type handed to the shared test driver.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_nt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_plus_f32_srgemm_tn_n.cu b/test/device/sm50_simt_min_plus_f32_srgemm_tn_n.cu new file mode 100644 index 0000000..7428f68 --- /dev/null +++ b/test/device/sm50_simt_min_plus_f32_srgemm_tn_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // one tile configuration of the min_plus f32 SRGEMM, run through TestAllGemm
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no argument to deduce it from
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_tn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_plus_f32_srgemm_tn_t.cu b/test/device/sm50_simt_min_plus_f32_srgemm_tn_t.cu new file mode 100644 index 0000000..aeda82c --- /dev/null +++ b/test/device/sm50_simt_min_plus_f32_srgemm_tn_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// 
Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::min_plus is named without template arguments here and
+  // in every test of this group - confirm it is not a class template.
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::min_plus is named without template arguments here and
+  // in every test of this group - confirm it is not a class template.
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::min_plus is named without template arguments here and
+  // in every test of this group - confirm it is not a class template.
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  // NOTE(review): cuasr::min_plus is named without template arguments here and
+  // in every test of this group - confirm it is not a class template.
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Pass the kernel type explicitly; otherwise the Srgemm alias is never used.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_tn_t, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_plus_f32_srgemm_tt_n.cu b/test/device/sm50_simt_min_plus_f32_srgemm_tt_n.cu
new file mode 100644
index 0000000..a4e35ee
--- /dev/null
+++ b/test/device/sm50_simt_min_plus_f32_srgemm_tt_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator over the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // testbed instantiated for the Srgemm type above
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// 
Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 
64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; 
+ using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
+ + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  // Build a float min-plus Srgemm (SIMT, Sm50) for this tiling and run the testbed on it.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+  using RingOp = cuasr::min_plus;  // NOTE(review): add <precision> if min_plus is a class template -- confirm
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  // Operand layouts: row-major, row-major, column-major (presumably the "tt_n" in the suite name).
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+  // Pass the kernel type explicitly; without it the Srgemm alias above is never exercised.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) {  // SIMT min_plus float SRGEMM, 256x64x8 threadblock tile
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel type under test
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)  // built only when CUASR_TEST_LEVEL is defined and >= 0
+TEST(SM50_device_min_plus_f32_srgemm_tt_n, 256x128x8_64x32x1_8x8_8x4_4x4) {  // SIMT min_plus float SRGEMM, 256x128x8 threadblock tile
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;  // epilogue taken from the default configuration
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm is the kernel type under test
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
diff --git a/test/device/sm50_simt_min_plus_f32_srgemm_tt_t.cu b/test/device/sm50_simt_min_plus_f32_srgemm_tt_t.cu
new file mode 100644
index 0000000..de145bd
--- /dev/null
+++ b/test/device/sm50_simt_min_plus_f32_srgemm_tt_t.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// 
Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = 
Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + 
precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + 
using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using 
SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using 
ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f32_srgemm_tt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_maximum_plus_dsrgemm_nn_n_sm50.cu 
b/test/device/sm50_simt_min_plus_f64_srgemm_nn_n.cu similarity index 70% rename from test/device/simt_maximum_plus_dsrgemm_nn_n_sm50.cu rename to test/device/sm50_simt_min_plus_f64_srgemm_nn_n.cu index 4a1c313..4ce6ec6 100644 --- a/test/device/simt_maximum_plus_dsrgemm_nn_n_sm50.cu +++ b/test/device/sm50_simt_min_plus_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = 
cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // 
- AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { 
using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { 
+TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 
64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape 
= cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 
128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ 
-1253,25 +1359,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + 
using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // diff --git a/test/device/simt_maximum_plus_dsrgemm_nn_t_sm50.cu b/test/device/sm50_simt_min_plus_f64_srgemm_nn_t.cu similarity index 70% rename from test/device/simt_maximum_plus_dsrgemm_nn_t_sm50.cu rename to test/device/sm50_simt_min_plus_f64_srgemm_nn_t.cu index 2aa813e..50e745d 100644 --- a/test/device/simt_maximum_plus_dsrgemm_nn_t_sm50.cu +++ b/test/device/sm50_simt_min_plus_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 
32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { 
+TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using 
Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape 
= cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / 
Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 
32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1429,59 @@ 
TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::min_plus; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/sm50_simt_min_plus_f64_srgemm_nt_n.cu b/test/device/sm50_simt_min_plus_f64_srgemm_nt_n.cu new file mode 100644 index 0000000..3bc61eb --- /dev/null +++ b/test/device/sm50_simt_min_plus_f64_srgemm_nt_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_plus_f64_srgemm_nt_t.cu b/test/device/sm50_simt_min_plus_f64_srgemm_nt_t.cu new file mode 100644 index 0000000..a6c1579 --- /dev/null +++ b/test/device/sm50_simt_min_plus_f64_srgemm_nt_t.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp:    4 x 8
// Warps / Block:     1 x 2
// Threadblock:       8 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     1 x 2
// Threadblock:       8 x 64 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp:    4 x 8
// Warps / Block:     1 x 2
// Threadblock:       16 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     1 x 2
// Threadblock:       16 x 64 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp:    4 x 8
// Warps / Block:     1 x 2
// Threadblock:       16 x 128 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp:    8 x 4
// Warps / Block:     1 x 2
// Threadblock:       32 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     1 x 2
// Threadblock:       32 x 64 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 1
// Threadblock:       32 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 1
// Threadblock:       64 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 2
// Threadblock:       16 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 2
// Threadblock:       16 x 64 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 2
// Threadblock:       32 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 2
// Threadblock:       32 x 64 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 2
// Threadblock:       32 x 128 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp:    8 x 4
// Warps / Block:     2 x 2
// Threadblock:       64 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 2
// Threadblock:       64 x 64 x 8
// Compiled only when CUASR_TEST_LEVEL >= 0.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp:    8 x 4
// Warps / Block:     2 x 2
// Threadblock:       128 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 4
// Threadblock:       16 x 64 x 16
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 4
// Threadblock:       16 x 128 x 16
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp:    8 x 4
// Warps / Block:     2 x 4
// Threadblock:       32 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 4
// Threadblock:       32 x 64 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 4
// Threadblock:       32 x 128 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 4
// Threadblock:       32 x 256 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp:    8 x 4
// Warps / Block:     2 x 4
// Threadblock:       64 x 64 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp:    4 x 8
// Warps / Block:     2 x 4
// Threadblock:       64 x 128 x 8
// Compiled only when CUASR_TEST_LEVEL >= 1.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp:    4 x 8
// Warps / Block:     4 x 2
// Threadblock:       32 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp:    4 x 8
// Warps / Block:     4 x 2
// Threadblock:       64 x 32 x 8
// Compiled only when CUASR_TEST_LEVEL >= 2.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
  using precision = double;
  using OpClass = cutlass::arch::OpClassSimt;
  using SmArch = cutlass::arch::Sm50;

  // NOTE(review): assumes cuasr::min_plus is templated on the element type.
  using RingOp = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision, RingOp, OpClass, SmArch>;
  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm<
      RingOp,
      precision, cutlass::layout::ColumnMajor,
      precision, cutlass::layout::RowMajor,
      precision, cutlass::layout::RowMajor,
      precision, OpClass, SmArch,
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Pass the kernel type configured above to the shared test driver.
  // NOTE(review): assumes TestAllGemm takes the kernel as a template argument.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 64 x 64 x 8
// Layouts handed to Srgemm, in order: ColumnMajor, RowMajor, RowMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 128 x 32 x 8
// Layouts handed to Srgemm, in order: ColumnMajor, RowMajor, RowMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 2
// Threadblock: 128 x 64 x 8
// Layouts handed to Srgemm, in order: ColumnMajor, RowMajor, RowMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 2
// Threadblock: 256 x 32 x 8
// Layouts handed to Srgemm, in order: ColumnMajor, RowMajor, RowMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<64, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 32 x 64 x 16
// Layouts handed to Srgemm, in order: ColumnMajor, RowMajor, RowMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 32 x 128 x 16
// Layouts handed to Srgemm, in order: ColumnMajor, RowMajor, RowMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 8 x 4
// Warps / Block: 4 x 4
// Threadblock: 64 x 32 x 16
// Layouts handed to Srgemm, in order: ColumnMajor, RowMajor, RowMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 8, 16>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 64 x 64 x 8
// Layouts handed to Srgemm, in order: ColumnMajor, RowMajor, RowMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 4 x 4
// Threadblock: 64 x 128 x 8
// Layouts handed to Srgemm, in order: ColumnMajor, RowMajor, RowMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::RowMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_min_plus_f64_srgemm_tn_n.cu b/test/device/sm50_simt_min_plus_f64_srgemm_tn_n.cu new file mode 100644 index 0000000..368af1f --- /dev/null +++ b/test/device/sm50_simt_min_plus_f64_srgemm_tn_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 2
// Threadblock: 8 x 32 x 8
// Layouts handed to Srgemm, in order: RowMajor, ColumnMajor, ColumnMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 2 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 2
// Threadblock: 8 x 64 x 8
// Layouts handed to Srgemm, in order: RowMajor, ColumnMajor, ColumnMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<8, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 2
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 2
// Threadblock: 16 x 32 x 8
// Layouts handed to Srgemm, in order: RowMajor, ColumnMajor, ColumnMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 2
// Threadblock: 16 x 64 x 8
// Layouts handed to Srgemm, in order: RowMajor, ColumnMajor, ColumnMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 8
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 2
// Threadblock: 16 x 128 x 8
// Layouts handed to Srgemm, in order: RowMajor, ColumnMajor, ColumnMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 8 x 4
// Warps / Block: 1 x 2
// Threadblock: 32 x 32 x 8
// Layouts handed to Srgemm, in order: RowMajor, ColumnMajor, ColumnMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 16, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 1 x 2
// Threadblock: 32 x 64 x 8
// Layouts handed to Srgemm, in order: RowMajor, ColumnMajor, ColumnMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 4 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 1
// Threadblock: 32 x 32 x 8
// Layouts handed to Srgemm, in order: RowMajor, ColumnMajor, ColumnMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<16, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

////////////////////////////////////////////////////////////////////////////////
// Elements / Thread: 8 x 4
// Threads / Warp: 4 x 8
// Warps / Block: 2 x 1
// Threadblock: 64 x 32 x 8
// Layouts handed to Srgemm, in order: RowMajor, ColumnMajor, ColumnMajor.
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
  using precision = double;
  using OpClass   = cutlass::arch::OpClassSimt;
  using SmArch    = cutlass::arch::Sm50;

  // NOTE(review): assumes min_plus is templated on the element type; confirm.
  using RingOp           = cuasr::min_plus<precision>;
  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;

  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
      precision, precision, precision, precision,
      RingOp, OpClass, SmArch>;

  using EpilogueOutputOp = Config::EpilogueOutputOp;

  using Srgemm = cuasr::gemm::device::Srgemm< //
      RingOp, //
      precision, cutlass::layout::RowMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, cutlass::layout::ColumnMajor, //
      precision, OpClass, SmArch, //
      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;

  // Srgemm must be named explicitly; the call has no arguments to deduce it from.
  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
}
#endif

+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_plus_f64_srgemm_tn_t.cu b/test/device/sm50_simt_min_plus_f64_srgemm_tn_t.cu
new file mode 100644
index 0000000..7675fbd
--- /dev/null
+++ b/test/device/sm50_simt_min_plus_f64_srgemm_tn_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;  // ring operator instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm selects the kernel under test
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_plus_f64_srgemm_tt_n.cu b/test/device/sm50_simt_min_plus_f64_srgemm_tt_n.cu
new file mode 100644
index 0000000..7706559
--- /dev/null
+++ b/test/device/sm50_simt_min_plus_f64_srgemm_tt_n.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): template arguments may have been lost here too — confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): template arguments may have been lost here too — confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) // built at every test level (0 or above)
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): template arguments may have been lost here too — confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): template arguments may have been lost here too — confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): template arguments may have been lost here too — confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
diff --git a/test/device/sm50_simt_min_plus_f64_srgemm_tt_t.cu b/test/device/sm50_simt_min_plus_f64_srgemm_tt_t.cu
new file mode 100644
index 0000000..efbfc67
--- /dev/null
+++ b/test/device/sm50_simt_min_plus_f64_srgemm_tt_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): template arguments may have been lost here too — confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) // built at every test level (0 or above)
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): template arguments may have been lost here too — confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) // built only at test level 2 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus; // NOTE(review): template arguments may have been lost here too — confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) // built only at test level 1 or above
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< // device-level kernel type under test
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // check the Srgemm type built above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+ using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using 
precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::min_plus; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_min_plus_f64_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::min_plus<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_or_and_f32_srgemm_nn_n.cu b/test/device/sm50_simt_or_and_f32_srgemm_nn_n.cu
new file mode 100644
index 0000000..990e33f
--- /dev/null
+++ b/test/device/sm50_simt_or_and_f32_srgemm_nn_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled when CUASR_TEST_LEVEL is defined and >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled when CUASR_TEST_LEVEL is defined and >= 0.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 2.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+//
+// or_and semiring SRGEMM over float; all three layouts are ColumnMajor, SIMT
+// op class, Sm50 arch tag. Compiled only when CUASR_TEST_LEVEL >= 1.
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Run the test bed on the kernel configured above; the explicit template
+  // argument is what ties the Srgemm alias to the check.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_nn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_or_and_f32_srgemm_nn_t.cu b/test/device/sm50_simt_or_and_f32_srgemm_nn_t.cu new file mode 100644 index 0000000..9824d07 --- /dev/null +++ b/test/device/sm50_simt_or_and_f32_srgemm_nn_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 
128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // pass the Srgemm alias so it is actually used
+}
+#endif  // CUASR_TEST_LEVEL >= 2
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_nn_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_or_and_f32_srgemm_nt_n.cu b/test/device/sm50_simt_or_and_f32_srgemm_nt_n.cu new file mode 100644 index 0000000..7fd6a7a --- /dev/null +++ b/test/device/sm50_simt_or_and_f32_srgemm_nt_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 
128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // semiring operator, instantiated for the element type
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type exercised by this test
+      RingOp,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // Srgemm must be named explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): if cuasr::or_and is a class template this alias (and every RingOp below) needs its template arguments -- confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel under test; Srgemm was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel under test; Srgemm was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel under test; Srgemm was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): if cuasr::or_and is a class template this alias (and every RingOp below) needs its template arguments -- confirm in cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel under test; Srgemm was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_nt_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel under test; Srgemm was otherwise unused
+}
+#endif
+
diff --git a/test/device/sm50_simt_or_and_f32_srgemm_nt_t.cu b/test/device/sm50_simt_or_and_f32_srgemm_nt_t.cu
new file mode 100644
index 0000000..6aacd7b
--- /dev/null
+++ b/test/device/sm50_simt_or_and_f32_srgemm_nt_t.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel under test; Srgemm was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel under test; Srgemm was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel under test; Srgemm was otherwise unused
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel under test; Srgemm was otherwise unused
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 
8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 
x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
+TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_nt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_or_and_f32_srgemm_tn_n.cu 
b/test/device/sm50_simt_or_and_f32_srgemm_tn_n.cu new file mode 100644 index 0000000..2cbd50f --- /dev/null +++ b/test/device/sm50_simt_or_and_f32_srgemm_tn_n.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< 
// + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 
16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + 
using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // Srgemm must be explicit: nothing to deduce it from +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL is defined and >= 1
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL is defined and >= 1
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)  // built only when CUASR_TEST_LEVEL is defined and >= 0
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL is defined and >= 1
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL is defined and >= 1
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL is defined and >= 1
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL is defined and >= 1
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)  // built only when CUASR_TEST_LEVEL is defined and >= 1
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)  // built only when CUASR_TEST_LEVEL is defined and >= 2
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): verify or_and needs no <precision> argument
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<                          //
+      RingOp,                                                          //
+      precision, cutlass::layout::RowMajor,                            //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, cutlass::layout::ColumnMajor,                         //
+      precision, OpClass, SmArch,                                      //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // run the testbed on the Srgemm type assembled above
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tn_n, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_or_and_f32_srgemm_tn_t.cu b/test/device/sm50_simt_or_and_f32_srgemm_tn_t.cu
new file mode 100644
index 0000000..75f7a24
--- /dev/null
+++ b/test/device/sm50_simt_or_and_f32_srgemm_tn_t.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// 
Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 
8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using 
precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 1 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and; // NOTE(review): a <precision> template argument may have been stripped here (and in the tests below) - confirm against the cuasr ring-op declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and; // NOTE(review): a <precision> template argument may have been stripped here (and in the tests below) - confirm against the cuasr ring-op declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and; // NOTE(review): a <precision> template argument may have been stripped here (and in the tests below) - confirm against the cuasr ring-op declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and; // NOTE(review): a <precision> template argument may have been stripped here (and in the tests below) - confirm against the cuasr ring-op declaration
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // kernel type passed explicitly: the call has no arguments to deduce it from
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape =
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tn_t, 256x128x8_64x32x1_8x8_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
diff --git a/test/device/sm50_simt_or_and_f32_srgemm_tt_n.cu
b/test/device/sm50_simt_or_and_f32_srgemm_tt_n.cu
new file mode 100644
index 0000000..7ae6449
--- /dev/null
+++ b/test/device/sm50_simt_or_and_f32_srgemm_tt_n.cu
@@ -0,0 +1,2090 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+    precision, precision, precision, precision,
+    RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+    RingOp, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::RowMajor, //
+    precision, cutlass::layout::ColumnMajor, //
+    precision, OpClass, SmArch, //
+    ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  // M x N tiling: WarpShape = elements/thread * threads/warp; ThreadblockShape = WarpShape * warps/block.
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // Hand the Srgemm type configured above to TestAllGemm; it is otherwise unused.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type handed to the testbed below
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel type so the testbed actually runs it
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type handed to the testbed below
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel type so the testbed actually runs it
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type handed to the testbed below
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel type so the testbed actually runs it
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type handed to the testbed below
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel type so the testbed actually runs it
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type handed to the testbed below
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel type so the testbed actually runs it
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;  // NOTE(review): confirm or_and needs no template arguments
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm<  // kernel type handed to the testbed below
+      RingOp,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::RowMajor,  //
+      precision, cutlass::layout::ColumnMajor,  //
+      precision, OpClass, SmArch,  //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp,  //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // name the kernel type so the testbed actually runs it
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_tt_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_or_and_f32_srgemm_tt_t.cu b/test/device/sm50_simt_or_and_f32_srgemm_tt_t.cu new file mode 100644 index 0000000..d0cbeed --- /dev/null +++ b/test/device/sm50_simt_or_and_f32_srgemm_tt_t.cu @@ -0,0 +1,2090 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 1
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 16 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  // Checks the float or_and SRGEMM (SIMT op class, Sm50 tag, row-major
+  // layouts) for this tile configuration.
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  // TestAllGemm takes no arguments to deduce from, so name the kernel type.
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 2
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f32_srgemm_tt_t, 128x256x8_32x64x1_8x8_4x8_4x4) {
+  using precision = float;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());
+}
+#endif // CUASR_TEST_LEVEL >= 1
+
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f32_srgemm_tt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_binary_or_binary_and_dsrgemm_nn_n_sm50.cu b/test/device/sm50_simt_or_and_f64_srgemm_nn_n.cu similarity index 70% rename from test/device/simt_binary_or_binary_and_dsrgemm_nn_n_sm50.cu rename to test/device/sm50_simt_or_and_f64_srgemm_nn_n.cu index 6e0e06d..7ac9109 100644 --- a/test/device/simt_binary_or_binary_and_dsrgemm_nn_n_sm50.cu +++ b/test/device/sm50_simt_or_and_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { 
+TEST(SM50_device_or_and_f64_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2 // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ 
TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2 // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2 // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4 // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { 
+TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4 // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) // Warps / Block: 4 x 
2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1149,94 @@ 
TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape 
= cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 +1254,24 @@ 
TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2 // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4 // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nn_n, 
64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_binary_or_binary_and_dsrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::binary_or, cuasr::binary_and, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // diff --git a/test/device/simt_minimum_plus_dsrgemm_nn_t_sm50.cu b/test/device/sm50_simt_or_and_f64_srgemm_nn_t.cu similarity index 70% rename from test/device/simt_minimum_plus_dsrgemm_nn_t_sm50.cu rename to test/device/sm50_simt_or_and_f64_srgemm_nn_t.cu index d2f5b82..28126b1 100644 --- a/test/device/simt_minimum_plus_dsrgemm_nn_t_sm50.cu +++ b/test/device/sm50_simt_or_and_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright 
(c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using 
MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 
32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { 
+TEST(SM50_device_or_and_f64_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 
32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // 
Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 
8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / 
Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 
x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/simt_minimum_plus_dsrgemm_tn_n_sm50.cu b/test/device/sm50_simt_or_and_f64_srgemm_nt_n.cu similarity index 70% rename from test/device/simt_minimum_plus_dsrgemm_tn_n_sm50.cu rename to test/device/sm50_simt_or_and_f64_srgemm_nt_n.cu index ead0b99..12857da 100644 --- a/test/device/simt_minimum_plus_dsrgemm_tn_n_sm50.cu +++ b/test/device/sm50_simt_or_and_f64_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,27 +29,26 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -65,27 +64,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -101,27 +99,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -137,27 +134,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -173,27 +169,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -209,27 +204,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -245,27 +239,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -281,27 +274,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // @@ -317,27 +309,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -353,27 +344,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -389,27 +379,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -425,27 +414,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -461,27 +449,26 @@ 
TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -497,27 +484,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -533,27 +519,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -569,27 +554,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -605,27 +589,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 
32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -641,27 +624,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -677,27 +659,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -713,27 +694,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -749,27 +729,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 
2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -785,27 +764,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -821,27 +799,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -857,27 +834,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -893,27 +869,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 
x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -929,27 +904,61 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -965,27 +974,61 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1001,27 +1044,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1037,27 +1079,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1073,27 +1114,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 
4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1109,27 +1149,96 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1145,27 +1254,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; 
using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1181,27 +1289,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1217,27 +1324,26 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1253,27 +1359,61 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 
64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1289,27 +1429,61 @@ TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_or_and_f64_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // diff --git a/test/device/sm50_simt_or_and_f64_srgemm_nt_t.cu b/test/device/sm50_simt_or_and_f64_srgemm_nt_t.cu new file mode 100644 index 0000000..9541a88 --- /dev/null +++ b/test/device/sm50_simt_or_and_f64_srgemm_nt_t.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + 
RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 
32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
+TEST(SM50_device_or_and_f64_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using 
precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// 
+// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/sm50_simt_or_and_f64_srgemm_tn_n.cu b/test/device/sm50_simt_or_and_f64_srgemm_tn_n.cu new file mode 100644 index 0000000..be37870 --- 
/dev/null +++ b/test/device/sm50_simt_or_and_f64_srgemm_tn_n.cu @@ -0,0 +1,1495 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). +**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { + using precision = double; + using OpClass = 
cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 8 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = 
cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 16 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; 
+ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; 
+ + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 1 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 32 x 256 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
diff --git a/test/device/sm50_simt_or_and_f64_srgemm_tn_t.cu b/test/device/sm50_simt_or_and_f64_srgemm_tn_t.cu
new file mode 100644
index 0000000..3a13589
--- /dev/null
+++ b/test/device/sm50_simt_or_and_f64_srgemm_tn_t.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;  // NOTE(review): element-type argument added; confirm against cuasr/functional.h
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>());  // NOTE(review): kernel-type argument added; confirm against testbed.h
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 
32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 1 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 16 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { + using 
precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the Srgemm type above to the harness
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the Srgemm type above to the harness
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the Srgemm type above to the harness
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 2
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the Srgemm type above to the harness
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 2
+// Threadblock: 256 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the Srgemm type above to the harness
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 64 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // hand the Srgemm type above to the harness
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 32 x 128 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 4 x 4
+// Threadblock: 64 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 32 x 16
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 4 x 4
+// Threadblock: 128 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
diff --git a/test/device/sm50_simt_or_and_f64_srgemm_tt_n.cu b/test/device/sm50_simt_or_and_f64_srgemm_tt_n.cu
new file mode 100644
index 0000000..4837d76
---
/dev/null
+++ b/test/device/sm50_simt_or_and_f64_srgemm_tt_n.cu
@@ -0,0 +1,1495 @@
+/***************************************************************************************************
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu).
+**************************************************************************************************/
+/////////////////////////////////////////////////////////////////
+// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY //
+/////////////////////////////////////////////////////////////////
+
+#include "gtest/gtest.h"
+
+/// from upstream cutlass
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+/// from cuasr lib
+#include "cuasr/gemm/device/default_srgemm_configuration.h"
+#include "cuasr/gemm/device/srgemm.h"
+#include "cuasr/functional.h"
+
+/// from cuasr tools
+#include "cuasr/reference/srgemm/host_srgemm.h"
+
+/// from local test dir
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 8 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 16 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 1 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 1
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 2 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 16 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 2
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 8
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 32 x 128 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 4 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 4 x 8
+// Warps / Block: 2 x 2
+// Threadblock: 64 x 64 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and<precision>;
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm = cuasr::gemm::device::Srgemm< //
+      RingOp, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::RowMajor, //
+      precision, cutlass::layout::ColumnMajor, //
+      precision, OpClass, SmArch, //
+      ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, //
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
+
+  EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm<Srgemm>()); // testbed must pass for this Srgemm
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Elements / Thread: 8 x 4
+// Threads / Warp: 8 x 4
+// Warps / Block: 2 x 2
+// Threadblock: 128 x 32 x 8
+#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1)
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) {
+  using precision = double;
+  using OpClass = cutlass::arch::OpClassSimt;
+  using SmArch = cutlass::arch::Sm50;
+
+  using RingOp = cuasr::or_and;
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
+      precision, precision, precision, precision,
+      RingOp, OpClass, SmArch>;
+
+  using EpilogueOutputOp = Config::EpilogueOutputOp;
+
+  using Srgemm
= cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 16 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
+TEST(SM50_device_or_and_f64_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { + using 
precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 32 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = 
cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 64 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 32 x 128 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 2 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; + using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 2 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 32 x 16 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + diff --git a/test/device/simt_maximum_multiplies_dsrgemm_tt_t_sm50.cu 
b/test/device/sm50_simt_or_and_f64_srgemm_tt_t.cu similarity index 70% rename from test/device/simt_maximum_multiplies_dsrgemm_tt_t_sm50.cu rename to test/device/sm50_simt_or_and_f64_srgemm_tt_t.cu index 3a3af40..a6e7d45 100644 --- a/test/device/simt_maximum_multiplies_dsrgemm_tt_t_sm50.cu +++ b/test/device/sm50_simt_or_and_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; 
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape 
= cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; 
using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 
16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 
32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) // Warps / 
Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // 
Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ 
-1109,25 +1149,94 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_or_and_f64_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using 
ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1254,24 @@ 
TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_or_and_f64_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + 
using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_multiplies_dsrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_or_and_f64_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::or_and; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::multiplies, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + 
precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_or_and_f64_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::or_and; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/simt_minimum_plus_ssrgemm_nn_n_sm50.cu b/test/device/sm50_simt_plus_mult_f32_srgemm_nn_n.cu similarity index 77% rename from test/device/simt_minimum_plus_ssrgemm_nn_n_sm50.cu rename to test/device/sm50_simt_plus_mult_f32_srgemm_nn_n.cu index 1a19a7b..4eaa928 100644 --- a/test/device/simt_minimum_plus_ssrgemm_nn_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f32_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). 
+* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { 
using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { 
+TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { // 
Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -821,25 +799,24 @@ 
TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1036,26 +1008,25 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 +1114,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 
64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 
128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1469,25 +1429,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1576,26 +1533,25 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1613,25 +1569,24 @@ 
TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; 
using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1829,25 +1779,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; 
using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + 
using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_nn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // diff --git a/test/device/simt_minimum_plus_ssrgemm_nt_n_sm50.cu b/test/device/sm50_simt_plus_mult_f32_srgemm_nn_t.cu similarity index 77% rename from test/device/simt_minimum_plus_ssrgemm_nt_n_sm50.cu rename to test/device/sm50_simt_plus_mult_f32_srgemm_nn_t.cu index cc265b1..bac06ca 100644 --- a/test/device/simt_minimum_plus_ssrgemm_nt_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f32_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 
1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { 
using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 
// Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using 
WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +939,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 
64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +974,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1036,29 +1008,28 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1044,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; @@ -1109,28 +1079,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1114,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1149,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1184,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // 
precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1219,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1254,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1325,28 +1289,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1361,28 +1324,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1397,28 +1359,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1433,28 +1394,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1469,28 +1429,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = 
float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1505,28 +1464,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1541,28 +1499,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, 
// - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1576,29 +1533,28 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1613,28 +1569,27 @@ 
TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1649,28 +1604,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1685,28 +1639,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1721,28 +1674,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1757,28 +1709,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1793,28 +1744,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1829,28 +1779,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1865,28 +1814,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1901,28 +1849,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1937,28 +1884,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1973,28 +1919,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,28 +1954,62 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2045,28 +2024,62 @@ TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_nn_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + 
using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_maximum_plus_ssrgemm_nn_t_sm50.cu b/test/device/sm50_simt_plus_mult_f32_srgemm_nt_n.cu similarity index 77% rename from test/device/simt_maximum_plus_ssrgemm_nn_t_sm50.cu rename to test/device/sm50_simt_plus_mult_f32_srgemm_nt_n.cu index 04d5adc..881be62 100644 --- a/test/device/simt_maximum_plus_ssrgemm_nn_t_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f32_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { 
+TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ 
TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape 
= cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision 
= float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, 
// precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 
16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 
64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +939,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +974,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1036,29 +1008,28 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 
128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1044,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1079,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1114,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1149,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1184,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1219,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1254,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp 
= Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1325,28 +1289,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1361,28 +1324,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1397,28 +1359,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { 
+TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1433,28 +1394,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1469,28 +1429,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1505,28 +1464,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1541,28 +1499,27 @@ 
TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1576,29 +1533,28 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1613,28 +1569,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1649,28 +1604,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1685,28 +1639,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1721,28 +1674,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 
256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1757,28 +1709,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1793,28 +1744,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1829,28 +1779,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1865,28 +1814,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1901,28 +1849,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1937,28 +1884,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1973,28 +1919,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,28 +1954,62 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2045,28 +2024,62 @@ TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_nt_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + 
precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_minimum_plus_ssrgemm_tn_t_sm50.cu b/test/device/sm50_simt_plus_mult_f32_srgemm_nt_t.cu similarity index 77% rename from test/device/simt_minimum_plus_ssrgemm_tn_t_sm50.cu rename to test/device/sm50_simt_plus_mult_f32_srgemm_nt_t.cu index 8cd9fcf..d4b8335 100644 --- a/test/device/simt_minimum_plus_ssrgemm_tn_t_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f32_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 
32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 
// Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { 
using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 
x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +939,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +974,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1036,29 +1008,28 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1044,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1079,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1114,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1149,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1184,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1219,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1254,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1325,28 +1289,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1361,28 +1324,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1397,28 +1359,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1433,28 +1394,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1469,28 +1429,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1505,28 +1464,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1541,28 +1499,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1576,29 +1533,28 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1613,28 +1569,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1649,28 +1604,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1685,28 +1639,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1721,28 +1674,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1757,28 +1709,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1793,28 +1744,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1829,28 +1779,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1865,28 +1814,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + 
RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1901,28 +1849,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1937,28 +1884,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // 
Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1973,28 +1919,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 
8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,28 +1954,62 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2045,28 +2024,62 @@ TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_nt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape 
= cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_maximum_plus_ssrgemm_nt_n_sm50.cu b/test/device/sm50_simt_plus_mult_f32_srgemm_tn_n.cu similarity index 77% rename from test/device/simt_maximum_plus_ssrgemm_nt_n_sm50.cu rename to test/device/sm50_simt_plus_mult_f32_srgemm_tn_n.cu index b49dd8c..b41a6fd 100644 --- a/test/device/simt_maximum_plus_ssrgemm_nt_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f32_srgemm_tn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { 
+TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ 
TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision 
= float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, 
// precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 
16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 
64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +939,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - 
precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +974,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1036,29 +1008,28 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 
128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1044,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1079,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1114,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1149,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1184,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1219,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1254,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp 
= Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1325,28 +1289,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1361,28 +1324,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1397,28 +1359,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { 
+TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1433,28 +1394,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1469,28 +1429,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1505,28 +1464,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1541,28 +1499,27 @@ 
TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1576,29 +1533,28 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1613,28 +1569,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1649,28 +1604,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1685,28 +1639,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1721,28 +1674,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 
256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1757,28 +1709,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; 
using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1793,28 +1744,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1829,28 +1779,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1865,28 +1814,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1901,28 +1849,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1937,28 +1884,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1973,28 +1919,27 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,28 +1954,62 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2045,28 +2024,62 @@ TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_nt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_tn_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + 
precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_minimum_plus_ssrgemm_tt_n_sm50.cu b/test/device/sm50_simt_plus_mult_f32_srgemm_tn_t.cu similarity index 77% rename from test/device/simt_minimum_plus_ssrgemm_tt_n_sm50.cu rename to test/device/sm50_simt_plus_mult_f32_srgemm_tn_t.cu index 0a6a561..ab7790b 100644 --- a/test/device/simt_minimum_plus_ssrgemm_tt_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f32_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 
32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 
// Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { 
using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 
x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +939,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = 
cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +974,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1036,29 +1008,28 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1044,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1079,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1114,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1149,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1184,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1219,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1254,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1325,28 +1289,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1361,28 +1324,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, 
// + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1397,28 +1359,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1433,28 +1394,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // 
Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1469,28 +1429,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1505,28 +1464,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1541,28 +1499,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1576,29 +1533,28 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1613,28 +1569,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
-TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1649,28 +1604,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1685,28 +1639,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1721,28 +1674,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1757,28 +1709,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1793,28 +1744,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1829,28 +1779,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, 
// - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1865,28 +1814,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1901,28 +1849,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1937,28 +1884,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // 
Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1973,28 +1919,27 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 
8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2009,28 +1954,62 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -2045,28 +2024,62 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_tn_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape 
= cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_maximum_plus_ssrgemm_tt_n_sm50.cu b/test/device/sm50_simt_plus_mult_f32_srgemm_tt_n.cu similarity index 77% rename from test/device/simt_maximum_plus_ssrgemm_tt_n_sm50.cu rename to test/device/sm50_simt_plus_mult_f32_srgemm_tt_n.cu index ef7f2df..b68b815 100644 --- a/test/device/simt_maximum_plus_ssrgemm_tt_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f32_srgemm_tt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch 
= cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp 
= Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 
64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { 
+TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 
32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1036,26 +1008,25 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 
2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 
+1114,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 
1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; 
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x256x8_32x64x1_8x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using 
WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1469,25 +1429,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1576,26 +1533,25 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { using 
precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { 
+TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1829,25 +1779,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_ssrgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_tt_n, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // diff --git a/test/device/simt_minimum_plus_ssrgemm_tt_t_sm50.cu b/test/device/sm50_simt_plus_mult_f32_srgemm_tt_t.cu 
similarity index 77% rename from test/device/simt_minimum_plus_ssrgemm_tt_t_sm50.cu rename to test/device/sm50_simt_plus_mult_f32_srgemm_tt_t.cu index 717dea4..98bb7ce 100644 --- a/test/device/simt_minimum_plus_ssrgemm_tt_t_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f32_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x8_32x64x1_8x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x32x8_64x32x1_8x8_8x4_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; 
using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - 
AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x128x8_32x64x1_8x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; 
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_64x32x1_8x8_8x4_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_32x64x1_8x8_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x32x8_64x32x1_8x8_8x4_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using 
SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +939,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { using precision = 
float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +974,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x128x8_32x64x1_8x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1036,26 +1008,25 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Threads / Warp: 8 x 4 // Warps / Block: 2 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1044,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x64x8_64x32x1_8x8_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1079,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + 
using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1114,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1149,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) 
and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1184,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1219,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1254,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 
x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1325,25 +1289,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1361,25 +1324,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1397,25 +1359,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x256x8_32x64x1_8x8_4x8_2x4) { // 
Warps / Block: 2 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1433,25 +1394,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x128x8_64x32x1_8x8_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1469,25 +1429,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1505,25 +1464,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 
64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1541,25 +1499,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1576,26 +1533,25 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Threads / Warp: 4 x 8 // Warps / Block: 4 x 2 // Threadblock: 128 x 64 x 8 -#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1613,25 +1569,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1649,25 +1604,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x128x8_32x64x1_8x8_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1685,25 +1639,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { // Warps / Block: 4 x 2 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1721,25 +1674,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 256x64x8_64x32x1_8x8_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1757,25 +1709,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1793,25 +1744,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< 
// - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1829,25 +1779,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1865,25 +1814,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = 
cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1901,25 +1849,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 256 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 256, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; 
using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1937,25 +1884,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 64x256x8_16x64x1_4x8_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1973,25 +1919,24 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2009,25 +1954,59 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { using precision = float; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 128x256x8_32x64x1_8x8_4x8_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -2045,25 +2024,59 @@ TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 128x128x8_32x32x1_8x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 256 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_ssrgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 256x64x8_64x16x1_8x4_8x4_4x4) { using precision = float; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<256, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 8 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 256 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f32_srgemm_tt_t, 256x128x8_64x32x1_8x8_8x4_4x4) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/simt_minimum_plus_dsrgemm_nn_n_sm50.cu b/test/device/sm50_simt_plus_mult_f64_srgemm_nn_n.cu similarity index 71% rename from test/device/simt_minimum_plus_dsrgemm_nn_n_sm50.cu rename to test/device/sm50_simt_plus_mult_f64_srgemm_nn_n.cu index 14b5dea..e7b4277 100644 --- a/test/device/simt_minimum_plus_dsrgemm_nn_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f64_srgemm_nn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 
16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 
16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 
32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ 
-533,25 +519,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using 
WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using 
ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 
+// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, 
precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 
32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // diff --git a/test/device/simt_maximum_plus_dsrgemm_nt_n_sm50.cu b/test/device/sm50_simt_plus_mult_f64_srgemm_nn_t.cu similarity index 71% rename from test/device/simt_maximum_plus_dsrgemm_nt_n_sm50.cu rename to 
test/device/sm50_simt_plus_mult_f64_srgemm_nn_t.cu index 367652b..c2acb37 100644 --- a/test/device/simt_maximum_plus_dsrgemm_nt_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f64_srgemm_nn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { 
+TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ 
TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 
2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // 
- precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 
32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +974,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +1044,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1037,28 +1079,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1114,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1149,97 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = 
cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, 
// + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1254,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1289,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1324,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::ColumnMajor, // - precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1359,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1429,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_nt_n, 
128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 
1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_maximum_plus_dsrgemm_tn_n_sm50.cu b/test/device/sm50_simt_plus_mult_f64_srgemm_nt_n.cu similarity index 71% rename from test/device/simt_maximum_plus_dsrgemm_tn_n_sm50.cu rename to test/device/sm50_simt_plus_mult_f64_srgemm_nt_n.cu index 078312e..664bbf8 100644 --- a/test/device/simt_maximum_plus_dsrgemm_tn_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f64_srgemm_nt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,27 +29,26 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -65,27 +64,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using 
precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -101,27 +99,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = 
typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -137,27 +134,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -173,27 +169,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -209,27 +204,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using 
OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -245,27 +239,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -281,27 +274,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -317,27 +309,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -353,27 +344,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -389,27 +379,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -425,27 +414,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -461,27 +449,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -497,27 +484,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -533,27 +519,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -569,27 +554,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -605,27 +589,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -641,27 +624,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -677,27 +659,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -713,27 +694,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -749,27 +729,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -785,27 +764,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -821,27 +799,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -857,27 +834,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // 
precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -893,27 +869,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -929,27 +904,61 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -965,27 +974,61 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1001,27 +1044,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, 
cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1037,27 +1079,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, 
// precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1073,27 +1114,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1109,27 +1149,96 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) 
{ using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1145,27 +1254,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { 
+TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1181,27 +1289,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1217,27 +1324,26 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1253,27 +1359,61 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 
x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // @@ -1289,27 +1429,61 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // diff --git a/test/device/simt_maximum_plus_dsrgemm_tn_t_sm50.cu b/test/device/sm50_simt_plus_mult_f64_srgemm_nt_t.cu similarity index 71% rename from test/device/simt_maximum_plus_dsrgemm_tn_t_sm50.cu rename to 
test/device/sm50_simt_plus_mult_f64_srgemm_nt_t.cu index fb2156a..01443e6 100644 --- a/test/device/simt_maximum_plus_dsrgemm_tn_t_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f64_srgemm_nt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { 
+TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ 
TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) 
-TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // 
precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = 
cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, 
EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { 
using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, 
precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +974,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +1044,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1037,28 +1079,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm 
= cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1114,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1149,97 @@ 
TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = 
cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1254,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1289,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { 
+TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1324,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1359,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm 
= cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1429,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 
128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_nt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_minimum_plus_dsrgemm_nt_n_sm50.cu b/test/device/sm50_simt_plus_mult_f64_srgemm_tn_n.cu similarity index 71% rename from test/device/simt_minimum_plus_dsrgemm_nt_n_sm50.cu rename to test/device/sm50_simt_plus_mult_f64_srgemm_tn_n.cu index 8263aae..1229881 100644 --- a/test/device/simt_minimum_plus_dsrgemm_nt_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f64_srgemm_tn_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { 
+TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ 
TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // 
Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; 
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, 
SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { 
+TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,62 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +974,62 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) 
+TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +1044,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1037,28 +1079,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1114,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1149,97 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 
128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using 
AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1254,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1289,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 
16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1324,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - 
using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1359,62 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1429,62 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tn_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, 
cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_maximum_plus_dsrgemm_tt_n_sm50.cu b/test/device/sm50_simt_plus_mult_f64_srgemm_tn_t.cu similarity index 71% rename from test/device/simt_maximum_plus_dsrgemm_tt_n_sm50.cu rename to test/device/sm50_simt_plus_mult_f64_srgemm_tn_t.cu index 92d7597..98001cf 100644 --- a/test/device/simt_maximum_plus_dsrgemm_tt_n_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f64_srgemm_tn_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - 
using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 
32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, 
precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, 
cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / 
Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = 
cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, 
precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, 
// ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { 
+TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = 
cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ 
TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using 
EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using 
precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +974,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp 
= Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +1044,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { 
+TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1037,28 +1079,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1073,28 +1114,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm 
= cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1149,97 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1254,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1289,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1324,27 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 
64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::RowMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1359,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, 
precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1429,62 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tn_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using 
precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_minimum_plus_dsrgemm_nt_t_sm50.cu b/test/device/sm50_simt_plus_mult_f64_srgemm_tt_n.cu similarity index 71% rename from test/device/simt_minimum_plus_dsrgemm_nt_t_sm50.cu rename to test/device/sm50_simt_plus_mult_f64_srgemm_tt_n.cu index cdb73a4..d27f366 100644 --- a/test/device/simt_minimum_plus_dsrgemm_nt_t_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f64_srgemm_tt_n.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
**************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,28 +29,27 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -65,28 +64,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { 
+TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -101,28 +99,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -137,28 +134,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -173,28 +169,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -209,28 +204,27 @@ 
TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -245,28 +239,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using 
RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -281,28 +274,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -317,28 +309,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, 
cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -353,28 +344,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -389,28 +379,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL 
>= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -425,28 +414,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -461,28 +449,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; 
using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -497,28 +484,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -533,28 +519,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -569,28 +554,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -605,28 +589,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - 
cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -641,28 +624,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + 
RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -677,28 +659,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -713,28 +694,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 
x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -749,28 +729,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = 
cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -785,28 +764,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -821,28 +799,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -857,28 +834,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -893,28 +869,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 
32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -929,28 +904,62 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - 
precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -965,28 +974,62 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x128x8_32x32x1_8x4_4x8_2x4) { + using 
precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1001,28 +1044,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1037,28 +1079,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2>; @@ -1073,28 +1114,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1109,28 +1149,97 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; 
using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1145,28 +1254,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { 
+TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1181,28 +1289,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1217,28 +1324,27 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // - precision, cutlass::layout::ColumnMajor, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1253,28 +1359,62 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} 
+#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; @@ -1289,28 +1429,62 @@ TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_minimum_plus_dsrgemm_nt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< 
// - precision, precision, precision, precision, OpClass, // - cuasr::minimum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // precision, cutlass::layout::ColumnMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tt_n, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // precision, OpClass, SmArch, // ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; diff --git a/test/device/simt_maximum_plus_dsrgemm_tt_t_sm50.cu b/test/device/sm50_simt_plus_mult_f64_srgemm_tt_t.cu similarity index 71% rename from test/device/simt_maximum_plus_dsrgemm_tt_t_sm50.cu rename to test/device/sm50_simt_plus_mult_f64_srgemm_tt_t.cu index 8600765..e406a0a 100644 --- a/test/device/simt_maximum_plus_dsrgemm_tt_t_sm50.cu +++ b/test/device/sm50_simt_plus_mult_f64_srgemm_tt_t.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2020, Vijay Thakkar (thakkarv@gatech.edu). +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). **************************************************************************************************/ ///////////////////////////////////////////////////////////////// // THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // @@ -29,25 +29,24 @@ // Warps / Block: 1 x 1 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using 
Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -65,25 +64,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 8x32x8_8x32x1_2x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -101,25 +99,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x32x8_16x32x1_4x4_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + 
using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -137,25 +134,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x64x8_16x64x1_4x8_4x8_1x1) { // Warps / Block: 1 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = 
Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -173,25 +169,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_32x32x1_8x4_4x8_1x1) { // Warps / Block: 1 x 2 // Threadblock: 8 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -209,25 +204,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 8x32x8_8x16x1_2x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 8 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = 
cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<8, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -245,25 +239,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 8x64x8_8x32x1_2x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp 
= Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -281,25 +274,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x32x8_16x16x1_4x2_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -317,25 +309,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x64x8_16x32x1_4x4_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 16 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { using precision = double; using OpClass = 
cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -353,25 +344,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x128x8_16x64x1_4x8_4x8_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = 
Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -389,25 +379,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_32x16x1_4x4_8x4_1x2) { // Warps / Block: 1 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -425,25 +414,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x64x8_32x32x1_8x4_4x8_1x2) { // Warps / Block: 2 x 1 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { using precision = 
double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -461,25 +449,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_16x32x1_4x4_4x8_2x1) { // Warps / Block: 2 x 1 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = 
Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -497,25 +484,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x32x8_32x32x1_8x4_4x8_2x1) { // Warps / Block: 2 x 2 // Threadblock: 16 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -533,25 +519,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x32x8_8x16x1_2x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 16 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 
16x64x8_8x32x1_2x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -569,25 +554,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x64x8_8x32x1_2x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, 
OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -605,25 +589,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_16x16x1_4x2_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -641,25 +624,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x64x8_16x32x1_4x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { 
+TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -677,25 +659,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x128x8_16x64x1_4x8_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + 
precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -713,25 +694,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x32x8_32x16x1_4x4_8x4_2x2) { // Warps / Block: 2 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -749,25 +729,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x64x8_32x32x1_8x4_4x8_2x2) { // Warps / Block: 2 x 2 // Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 
128x32x8_64x16x1_8x4_8x4_2x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -785,25 +764,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 128x32x8_64x16x1_8x4_8x4_2x2) { // Warps / Block: 2 x 4 // Threadblock: 16 x 64 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename 
cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -821,25 +799,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x64x16_8x16x1_2x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 16 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -857,25 +834,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 16x128x16_8x32x1_2x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and 
(CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -893,25 +869,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_16x8x1_2x2_8x4_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, 
SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -929,25 +904,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x64x8_16x16x1_4x2_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 32 x 128 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 8 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 32 x 256 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x256x8_16x64x1_4x8_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -965,25 +974,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x128x8_16x32x1_4x4_4x8_2x4) { // Warps / Block: 2 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, 
OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 2 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x128x8_32x32x1_8x4_4x8_2x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1001,25 +1044,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x64x8_32x16x1_4x4_8x4_2x4) { // Warps / Block: 4 x 2 // Threadblock: 32 x 32 x 8 #if 
defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1037,25 +1079,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x32x8_8x16x1_2x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // 
- cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1073,25 +1114,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x32x8_16x16x1_4x2_4x8_4x2) { // Warps / Block: 4 x 2 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1109,25 +1149,94 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x64x8_16x32x1_4x4_4x8_4x2) { // Warps / Block: 4 x 2 // 
Threadblock: 128 x 32 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 8>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 2 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 0) +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 128x64x8_32x32x1_8x4_4x8_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; + using 
InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 8 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 2 +// Threadblock: 256 x 32 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 256x32x8_64x16x1_8x4_8x4_4x2) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<256, 32, 8>; + using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1145,25 +1254,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 128x32x8_32x16x1_4x4_8x4_4x2) { // Warps / Block: 4 x 4 // Threadblock: 32 x 64 x 16 
#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 64, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 16, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1181,25 +1289,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x64x16_8x16x1_2x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 32 x 128 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<8, 32, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, 
precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1217,25 +1324,24 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 32x128x16_8x32x1_2x4_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 64 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1253,25 +1359,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x32x16_16x8x1_2x2_8x4_4x4) { // Warps 
/ Block: 4 x 4 // Threadblock: 64 x 64 x 8 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 16, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 4 x 8 +// Warps / Block: 4 x 4 +// Threadblock: 64 x 128 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 64x128x8_16x32x1_4x4_4x8_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = 
cutlass::gemm::GemmShape<64, 128, 8>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // @@ -1289,25 +1429,59 @@ TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 64x64x8_16x16x1_4x2_4x8_4x4) { // Warps / Block: 4 x 4 // Threadblock: 128 x 32 x 16 #if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 2) -TEST(SM50_device_maximum_plus_dsrgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 128x32x16_32x8x1_4x2_8x4_4x4) { using precision = double; using OpClass = cutlass::arch::OpClassSimt; using SmArch = cutlass::arch::Sm50; + using RingOp = cuasr::plus_mult; using ThreadblockShape = cutlass::gemm::GemmShape<128, 32, 16>; using WarpShape = cutlass::gemm::GemmShape<32, 8, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; - using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< // - precision, precision, precision, precision, OpClass, // - cuasr::maximum, cuasr::plus, SmArch>; + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; + + using EpilogueOutputOp = Config::EpilogueOutputOp; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, OpClass, SmArch, // + ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, // + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; + + 
EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Elements / Thread: 4 x 4 +// Threads / Warp: 8 x 4 +// Warps / Block: 4 x 4 +// Threadblock: 128 x 64 x 8 +#if defined(CUASR_TEST_LEVEL) and (CUASR_TEST_LEVEL >= 1) +TEST(SM50_device_plus_mult_f64_srgemm_tt_t, 128x64x8_32x16x1_4x4_8x4_4x4) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm50; + + using RingOp = cuasr::plus_mult; + using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 8>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Config = typename cuasr::gemm::device::DefaultSemiRingConfiguration< + precision, precision, precision, precision, + RingOp, OpClass, SmArch>; - using AddOp = Config::AdditionOp; - using MultOp = Config::MultiplicationOp; using EpilogueOutputOp = Config::EpilogueOutputOp; using Srgemm = cuasr::gemm::device::Srgemm< // - AddOp, MultOp, // + RingOp, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // precision, cutlass::layout::RowMajor, // diff --git a/test/device/sm80_defaults.cu b/test/device/sm80_defaults.cu new file mode 100644 index 0000000..741738e --- /dev/null +++ b/test/device/sm80_defaults.cu @@ -0,0 +1,3481 @@ +/*************************************************************************************************** +* Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+**************************************************************************************************/ +///////////////////////////////////////////////////////////////// +// THIS TEST FILE IS GENERATED AUTOMATICALLY : DO NOT MODIFY // +///////////////////////////////////////////////////////////////// + +#include "gtest/gtest.h" + +/// from upstream cutlass +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +/// from cuasr lib +#include "cuasr/gemm/device/default_srgemm_configuration.h" +#include "cuasr/gemm/device/srgemm.h" +#include "cuasr/functional.h" + +/// from cuasr tools +#include "cuasr/reference/srgemm/host_srgemm.h" + +/// from local test dir +#include "testbed.h" + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM80_default_plus_mult_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, 
cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_plus_mult_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::plus_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_plus_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_plus_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_plus; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_max_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_max; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_min_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_min; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_min_mult_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::min_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_max_mult_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::max_mult; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f64_tt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f64_tt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f64_tn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f64_tn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f64_nt_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f64_nt_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = 
cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f64_nn_n) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f64_nn_t) { + using precision = double; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f32_tt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f32_tt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f32_tn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f32_tn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f32_nt_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< 
// + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f32_nt_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f32_nn_n) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, f32_nn_t) { + using precision = float; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + 
+/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, s32_tt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, s32_tt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, s32_tn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, s32_tn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + 
RingOp, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, s32_nt_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, s32_nt_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_default_or_and_srgemm, s32_nn_n) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + +/////////////////////////////////////////////////////////////////////////////// 
+ +TEST(SM80_default_or_and_srgemm, s32_nn_t) { + using precision = int; + using OpClass = cutlass::arch::OpClassSimt; + using SmArch = cutlass::arch::Sm80; + using RingOp = cuasr::or_and; + + using Srgemm = cuasr::gemm::device::Srgemm< // + RingOp, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::ColumnMajor, // + precision, cutlass::layout::RowMajor, + precision, OpClass, SmArch>; + + EXPECT_TRUE(cuasr::test::gemm::device::TestAllGemm()); +} + diff --git a/test/device/testbed.h b/test/device/testbed.h index 6e6083f..f30e50c 100644 --- a/test/device/testbed.h +++ b/test/device/testbed.h @@ -1,3 +1,36 @@ +/*************************************************************************************************** + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + #include #include #include @@ -162,22 +195,16 @@ struct Testbed { if (!passed) { // record failed test cases to a file for debug records - std::string add_op_name_full(abi::__cxa_demangle( - typeid(typename Srgemm::AdditionOp).name(), // - nullptr, nullptr, nullptr)); - - std::string mult_op_name_full(abi::__cxa_demangle( - typeid(typename Srgemm::MultiplicationOp).name(), // + std::string ring_op_name_full(abi::__cxa_demangle( + typeid(typename Srgemm::RingOp).name(), // nullptr, nullptr, nullptr)); - std::string add_op_name( - add_op_name_full.substr(0, add_op_name_full.find_first_of('<'))); - std::string mult_op_name( - mult_op_name_full.substr(0, mult_op_name_full.find_first_of('<'))); + std::string ring_op_name( + ring_op_name_full.substr(0, ring_op_name_full.find_first_of('<'))); std::stringstream fname; fname << "error_Srgemm_device_" << problem_size.m() << 'x' << problem_size.n() - << 'x' << problem_size.k() << '_' << add_op_name << '_' << mult_op_name << '_' + << 'x' << problem_size.k() << '_' << ring_op_name << '_' << Srgemm::ThreadblockShape::kM << 'x' << Srgemm::ThreadblockShape::kN << 'x' << Srgemm::ThreadblockShape::kK << '_' << Srgemm::WarpShape::kM << 'x' << Srgemm::WarpShape::kN << 'x' << Srgemm::WarpShape::kK << ".txt"; @@ -186,9 +213,7 @@ struct Testbed { file << "problem: " << problem_size << ", 
alpha: " << alpha << ", beta: " << beta << "\n\n"; - file << "Addition operator: " << add_op_name_full << '\n'; - file << "Multiplication operator: " << mult_op_name_full << '\n'; - + file << "Ring Op: " << ring_op_name_full << '\n'; file << "A =\n" << tensor_A.host_view() << "\nB =\n" << tensor_B.host_view() << "\nC =\n" @@ -204,8 +229,7 @@ struct Testbed { bool verify( cutlass::gemm::GemmCoord problem_size, ElementCompute alpha, ElementCompute beta) { cuasr::reference::host::Srgemm< - typename Srgemm::AdditionOp, // - typename Srgemm::MultiplicationOp, // + typename Srgemm::RingOp, // typename Srgemm::ElementA, typename Srgemm::LayoutA, // typename Srgemm::ElementB, typename Srgemm::LayoutB, // typename Srgemm::ElementC, typename Srgemm::LayoutC, // @@ -217,7 +241,7 @@ struct Testbed { reference_srgemm( problem_size, alpha, tensor_A.host_ref(), tensor_B.host_ref(), // beta, tensor_C.host_ref(), reference_D.host_ref(), // - Srgemm::AdditionOp::Identity); + Srgemm::RingOp::AddIdentity); return compare_reference(problem_size, alpha, beta); } @@ -226,8 +250,8 @@ struct Testbed { bool run(cutlass::gemm::GemmCoord problem_size, int split_k_slices = 1, - ElementCompute alpha = ElementCompute(Srgemm::MultiplicationOp::Identity), - ElementCompute beta = ElementCompute(Srgemm::MultiplicationOp::Identity)) { + ElementCompute alpha = ElementCompute(Srgemm::RingOp::MultIdentity), + ElementCompute beta = ElementCompute(Srgemm::RingOp::MultIdentity)) { this->initialize(problem_size); // Initialize the GEMM operator @@ -310,19 +334,19 @@ bool TestAllGemm() { ? 
4 : kAlignment; - int problem_size_m[] = { kAlignmentM, 512 - 3 * kAlignmentM }; + int problem_size_m[] = { 234, kAlignmentM, 512 - 3 * kAlignmentM }; - int problem_size_n[] = { kAlignmentN, 512 - 2 * kAlignmentN }; + int problem_size_n[] = { 239, kAlignmentN, 512 - 2 * kAlignmentN }; int problem_size_k[] - = { kAlignmentK, + = { 237, kAlignmentK, Srgemm::ThreadblockShape::kK * (Srgemm::kStages + 1) - kAlignmentK }; // TODO: add split-K SRGEMM int split_k_slices[] = { 1, 2, 3, 8 }; - double problem_alpha[] = { Srgemm::MultiplicationOp::Identity }; - double problem_beta[] = { Srgemm::MultiplicationOp::Annihilator }; + double problem_alpha[] = { Srgemm::RingOp::MultIdentity }; + double problem_beta[] = { Srgemm::RingOp::MultAnnihilator }; Testbed testbed; using ElementCompute = typename Srgemm::EpilogueOutputOp::ElementCompute; diff --git a/test/harness.cpp b/test/harness.cpp index 6d8d54f..e98c4eb 100644 --- a/test/harness.cpp +++ b/test/harness.cpp @@ -1,3 +1,7 @@ +/*************************************************************************************************** + * Copyright (c) 2022, Vijay Thakkar (thakkarv@gatech.edu). 
+ **************************************************************************************************/ + #include "gtest/gtest.h" auto main(int argc, char **argv) -> int { diff --git a/test/regress/CMakeLists.txt b/test/regress/CMakeLists.txt deleted file mode 100644 index ff04a4f..0000000 --- a/test/regress/CMakeLists.txt +++ /dev/null @@ -1,42 +0,0 @@ -# cuasr library configuration -add_library(deprecated_libfwgpu ${cuASR_LIB_TYPE} - ./src/cutlass_srgemm.cu - ./src/utils.cu -) -target_include_directories(deprecated_libfwgpu - PUBLIC ${PROJECT_SOURCE_DIR}/include ${CUDA_INCLUDE_DIRS} - PRIVATE ${PROJECT_SOURCE_DIR}/cutlass/include - PRIVATE ${PROJECT_SOURCE_DIR}/test/regress/include -) -target_compile_options(deprecated_libfwgpu - PUBLIC - # C++ compiler flags - $<$,$>: - ${cuASR_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE}}> - - # CUDA compiler flags - $<$,$>: - ${cuASR_CUDA_FLAGS_${uppercase_CMAKE_BUILD_TYPE}}> -) - -### Matrix tests -add_executable(Matrix_tests ${PROJECT_SOURCE_DIR}/test/harness.cpp Matrix_test.cpp) -target_include_directories(Matrix_tests - PRIVATE gtest/googletest/include - PRIVATE ${PROJECT_SOURCE_DIR}/test/regress/include) -target_link_libraries(Matrix_tests gtest deprecated_libfwgpu) -add_test( - NAME Matrix_tests - COMMAND Matrix_tests -) - -### SemiRing GEMM tests -add_executable(tropical_gemm_tests ${PROJECT_SOURCE_DIR}/test/harness.cpp Srgemm_test.cu) -target_include_directories(tropical_gemm_tests - PRIVATE gtest/googletest/include - PRIVATE ${PROJECT_SOURCE_DIR}/test/regress/include) -target_link_libraries(tropical_gemm_tests gtest deprecated_libfwgpu) -add_test( - NAME tropical_gemm_tests - COMMAND tropical_gemm_tests -) diff --git a/test/regress/Matrix_test.cpp b/test/regress/Matrix_test.cpp deleted file mode 100644 index 60ca467..0000000 --- a/test/regress/Matrix_test.cpp +++ /dev/null @@ -1,166 +0,0 @@ -#include "gtest/gtest.h" - -#include "fwgpu/Matrix.hpp" - -TEST(cuASR_Matrix, BasicConstructorCorrect) { - auto x = fwgpu::Matrix(6, 
2); - for (auto i = 0u; i < 12; ++i) { - x(i) = (float)i; - } - - EXPECT_EQ(size_t { 12 }, x.size()); - EXPECT_EQ(size_t { 12 * sizeof(float) }, x.bytesize()); - EXPECT_EQ(size_t { 6 }, x.num_rows()); - EXPECT_EQ(size_t { 2 }, x.num_cols()); - EXPECT_FLOAT_EQ(10.0f, x(10)); - EXPECT_FLOAT_EQ(0.0f, x(0, 0)); - EXPECT_FLOAT_EQ(8.0f, x(2, 1)); - EXPECT_FLOAT_EQ(11.0f, x(5, 1)); -} - -TEST(cuASR_Matrix, InitializerListConstructorCorrect) { - // [8.0 3.0 0.0 1.0] - // [2.0 5.0 4.0 9.0] - // [7.0 6.0 10. 13.] - auto x = fwgpu::Matrix( - 3, 4, { 8.0, 2.0, 7.0, 3.0, 5.0, 6.0, 0.0, 4.0, 10.0, 1.0, 9.0, 13.0 }); - - EXPECT_EQ(size_t { 12 }, x.size()); - EXPECT_EQ(size_t { 12 * sizeof(float) }, x.bytesize()); - EXPECT_EQ(size_t { 3 }, x.num_rows()); - EXPECT_EQ(size_t { 4 }, x.num_cols()); - EXPECT_FLOAT_EQ(8.0f, x(0, 0)); - EXPECT_FLOAT_EQ(1.0f, x(0, 3)); - EXPECT_FLOAT_EQ(10.0f, x(2, 2)); -} - -TEST(cuASR_Matrix, RandomFloatMatrixConstructorCorrect) { - size_t const seed = 8; - auto const minimum = 1.0545; - auto const maximum = 28.1; - auto x = fwgpu::Matrix(9, 8, seed, minimum, maximum); - - EXPECT_EQ(size_t { 9 * 8 }, x.size()); - EXPECT_EQ(size_t { 9 * 8 * sizeof(double) }, x.bytesize()); - EXPECT_EQ(size_t { 9 }, x.num_rows()); - EXPECT_EQ(size_t { 8 }, x.num_cols()); - - for (auto i = 0u; i < x.size(); ++i) { - double const val = x(i); - EXPECT_TRUE((val >= minimum && val <= maximum)); - } -} - -TEST(cuASR_Matrix, RandomIntMatrixConstructorCorrect) { - size_t const seed = 8; - auto const minimum = 1; - auto const maximum = 128; - auto x = fwgpu::Matrix(7, 5, seed, minimum, maximum); - - EXPECT_EQ(size_t { 7 * 5 }, x.size()); - EXPECT_EQ(size_t { 7 * 5 * sizeof(int) }, x.bytesize()); - EXPECT_EQ(size_t { 7 }, x.num_rows()); - EXPECT_EQ(size_t { 5 }, x.num_cols()); - - for (auto i = 0u; i < x.size(); ++i) { - int const val = x(i); - EXPECT_TRUE((val >= minimum && val <= maximum)); - } -} - -TEST(cuASR_Matrix, CopyConstructorCorrect) { - auto from = fwgpu::Matrix(5, 7, 
0.0f); - auto to = from; - EXPECT_TRUE(from == to); -} - -TEST(cuASR_Matrix, MoveConstructorCorrect) { - auto from = fwgpu::Matrix(5, 7, 0.0f); - EXPECT_EQ(from.num_rows(), 5); - EXPECT_EQ(from.num_cols(), 7); - EXPECT_TRUE(from.get_buf() != nullptr); - - auto to = fwgpu::Matrix(std::move(from)); - EXPECT_EQ(to.num_rows(), 5); - EXPECT_EQ(to.num_cols(), 7); - EXPECT_TRUE(to.get_buf() != nullptr); - - EXPECT_EQ(from.num_rows(), 0); - EXPECT_EQ(from.num_cols(), 0); - EXPECT_TRUE(from.get_buf() == nullptr); -} - -TEST(cuASR_Matrix, ConstantConstructorCorrect) { - auto mat = fwgpu::Matrix(6, 2, 42); - EXPECT_EQ(mat.num_rows(), 6); - EXPECT_EQ(mat.num_cols(), 2); - EXPECT_EQ(mat(0, 0), 42); - EXPECT_EQ(mat(3, 0), 42); - EXPECT_EQ(mat(3, 1), 42); - EXPECT_EQ(mat(5, 1), 42); -} - -TEST(cuASR_Matrix, BufferConstructorCorrect) { - std::vector matvals(12, 42); - auto mat = fwgpu::Matrix(6, 2, matvals.data()); - - EXPECT_EQ(mat.num_rows(), 6); - EXPECT_EQ(mat.num_cols(), 2); - EXPECT_EQ(mat(0, 0), 42); - EXPECT_EQ(mat(3, 0), 42); - EXPECT_EQ(mat(3, 1), 42); - EXPECT_EQ(mat(5, 1), 42); -} - -TEST(cuASR_Matrix, CopyAssignmentCorrect) { - auto from = fwgpu::Matrix(5, 7, 0.0f); - fwgpu::Matrix to(1, 1); - to = from; - EXPECT_TRUE(from == to); -} - -TEST(cuASR_Matrix, ColumnMajorLayoutCorrect) { - auto mat = fwgpu::Matrix( - 4, 4, - { - 0.840187728, 0.911647379, 0.277774721, 0.364784479, // - 0.394382924, 0.197551370, 0.553969979, 0.513400912, // - 0.729605675, 0.335222751, 0.477397054, 0.952229738, // - 0.798440039, 0.768229604, 0.628870904, 0.916195095 // - }); - - // corners - EXPECT_FLOAT_EQ(mat(0, 0), 0.840187728); - EXPECT_FLOAT_EQ(mat(0, 3), 0.798440039); - EXPECT_FLOAT_EQ(mat(3, 0), 0.364784479); - EXPECT_FLOAT_EQ(mat(3, 3), 0.916195095); - - // middle 2x2 - EXPECT_FLOAT_EQ(mat(1, 1), 0.197551370); - EXPECT_FLOAT_EQ(mat(1, 2), 0.335222751); - EXPECT_FLOAT_EQ(mat(2, 1), 0.553969979); - EXPECT_FLOAT_EQ(mat(2, 2), 0.477397054); -} - -TEST(cuASR_Matrix, RowMajorLayoutCorrect) 
{ - auto mat = fwgpu::Matrix( - 4, 4, - { - 0.840187728, 0.911647379, 0.277774721, 0.364784479, // - 0.394382924, 0.197551370, 0.553969979, 0.513400912, // - 0.729605675, 0.335222751, 0.477397054, 0.952229738, // - 0.798440039, 0.768229604, 0.628870904, 0.916195095 // - }); - - // corners - EXPECT_FLOAT_EQ(mat(0, 0), 0.840187728); - EXPECT_FLOAT_EQ(mat(0, 3), 0.364784479); - EXPECT_FLOAT_EQ(mat(3, 0), 0.798440039); - EXPECT_FLOAT_EQ(mat(3, 3), 0.916195095); - - // middle 2x2 - EXPECT_FLOAT_EQ(mat(1, 1), 0.197551370); - EXPECT_FLOAT_EQ(mat(1, 2), 0.553969979); - EXPECT_FLOAT_EQ(mat(2, 1), 0.335222751); - EXPECT_FLOAT_EQ(mat(2, 2), 0.477397054); -} diff --git a/test/regress/Srgemm_test.cu b/test/regress/Srgemm_test.cu deleted file mode 100644 index b079827..0000000 --- a/test/regress/Srgemm_test.cu +++ /dev/null @@ -1,605 +0,0 @@ -#include "gtest/gtest.h" - -#include "fwgpu/Matrix.hpp" -#include "fwgpu/cpu_srgemm.hpp" -#include "fwgpu/gpu_srgemm.cuh" -#include "fwgpu/gpu_srgemm.hpp" -#include "fwgpu/utils.hpp" -#include "utils.cuh" - -#include -#include - -#include - -TEST(regress_cuASR_Srgemm, CpuNaiveCorrect) { - auto a = fwgpu::Matrix( - 4, 2, - { - 0.840187728, 0.911647379, 0.277774721, 0.364784479, // - 0.394382924, 0.19755137, 0.553969979, 0.513400912 // - }); - - auto b = fwgpu::Matrix( - 2, 4, - { - 0.840187728, 0.911647379, // - 0.394382924, 0.19755137, // - 0.729605675, 0.335222751, // - 0.798440039, 0.768229604 // - }); - - auto c = fwgpu::Matrix( - 4, 4, - { - 0.840187728, 0.911647379, 0.277774721, 0.364784479, // - 0.394382924, 0.197551370, 0.553969979, 0.513400912, // - 0.729605675, 0.335222751, 0.477397054, 0.952229738, // - 0.798440039, 0.768229604, 0.628870904, 0.916195095 // - }); - - auto correct = fwgpu::Matrix( - 4, 4, - { - 0.840187728, 0.911647379, 0.277774721, 0.364784479, // - 0.394382924, 0.197551370, 0.553969979, 0.513400912, // - 0.729605675, 0.335222751, 0.477397054, 0.848623633, // - 0.798440039, 0.768229604, 0.628870904, 0.916195095 // 
- }); - - fwgpu::cpu_srgemm_naive( - 4, 4, 2, // - a.get_buf(), a.num_rows(), // - b.get_buf(), b.num_rows(), // - c.get_buf(), c.num_rows()); - - EXPECT_EQ(correct.size(), c.size()); - EXPECT_EQ(correct.num_rows(), c.num_rows()); - EXPECT_EQ(correct.num_cols(), c.num_cols()); - for (auto i = 0ull; i < correct.size(); ++i) { - EXPECT_FLOAT_EQ(correct[i], c[i]); - } -} - -TEST(regress_cuASR_Srgemm, CutlassCorrect) { - auto a = fwgpu::Matrix( - 4, 2, - { - 0.840187728, 0.911647379, 0.277774721, 0.364784479, // - 0.394382924, 0.19755137, 0.553969979, 0.513400912 // - }); - - auto b = fwgpu::Matrix( - 2, 4, - { - 0.840187728, 0.911647379, // - 0.394382924, 0.19755137, // - 0.729605675, 0.335222751, // - 0.798440039, 0.768229604 // - }); - - auto c = fwgpu::Matrix( - 4, 4, - { - 0.840187728, 0.911647379, 0.277774721, 0.364784479, // - 0.394382924, 0.197551370, 0.553969979, 0.513400912, // - 0.729605675, 0.335222751, 0.477397054, 0.952229738, // - 0.798440039, 0.768229604, 0.628870904, 0.916195095 // - }); - - auto correct = fwgpu::Matrix( - 4, 4, - { - 0.840187728, 0.911647379, 0.277774721, 0.364784479, // - 0.394382924, 0.197551370, 0.553969979, 0.513400912, // - 0.729605675, 0.335222751, 0.477397054, 0.848623633, // - 0.798440039, 0.768229604, 0.628870904, 0.916195095 // - }); - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - fwgpu::cutlass_srsgemm_nn( - 4, 4, 2, d_A, a.num_rows(), d_B, b.num_rows(), d_C, c.num_rows(), true); - fwgpu::memcpy_d2h(c.get_buf(), d_C, c.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(correct.size(), c.size()); - EXPECT_EQ(correct.num_rows(), c.num_rows()); - EXPECT_EQ(correct.num_cols(), c.num_cols()); - for (auto i = 0ull; i < correct.size(); ++i) { - EXPECT_FLOAT_EQ(correct[i], c[i]); - } -} - -TEST(regress_cuASR_Srgemm, GpuNaiveEqCpuNaive) { - auto m = 128; - auto k = 32; 
- auto n = 128; - auto a = fwgpu::Matrix(m, k, 0xCAFED00D, 1.0, 1000.0); - auto b = fwgpu::Matrix(k, n, 0xCAFED00D, 1.0, 1000.0); - auto c_cpu_naive = fwgpu::Matrix(m, n, 0xCAFED00D, 1.0, 1000.0); - auto c_gpu_naive = c_cpu_naive; - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c_gpu_naive); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - fwgpu::cpu_srgemm_naive( - m, n, k, a.get_buf(), m, b.get_buf(), k, c_cpu_naive.get_buf(), m); - - dim3 threads(16, 16); - dim3 blocks((m - 1) / 16 + 1, (n - 1) / 16 + 1); - fwgpu::gpu_srgemm_naive<<>>(m, n, k, d_A, m, d_B, k, d_C, m); - fwgpu::memcpy_d2h(c_gpu_naive.get_buf(), d_C, c_gpu_naive.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_cpu_naive.size(), c_gpu_naive.size()); - EXPECT_EQ(c_cpu_naive.num_rows(), c_gpu_naive.num_rows()); - EXPECT_EQ(c_cpu_naive.num_cols(), c_gpu_naive.num_cols()); - for (auto i = 0ull; i < c_cpu_naive.size(); ++i) { - EXPECT_FLOAT_EQ(c_cpu_naive[i], c_gpu_naive[i]); - } -} - -TEST(regress_cuASR_Srgemm, GpuNaiveEqCutlass) { - auto N = 128; - auto a = fwgpu::Matrix(N, N, 0xCAFED00D, 1.0, 1000.0); - auto b = fwgpu::Matrix(N, N, 0xCAFED00D, 1.0, 1000.0); - auto c_gpu_naive = fwgpu::Matrix(N, N, 0.0f); - auto c_gpu_cutlass = c_gpu_naive; - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats( - a, b, c_gpu_naive, c_gpu_cutlass); - - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C_naive = std::get<2>(dptrs); - float *d_C_cutlass = std::get<3>(dptrs); - - dim3 threads(16, 16); - dim3 blocks((N - 1) / 16 + 1, (N - 1) / 16 + 1); - fwgpu::gpu_srgemm_naive<<>>(N, N, N, d_A, N, d_B, N, d_C_naive, N); - fwgpu::memcpy_d2h(c_gpu_naive.get_buf(), d_C_naive, c_gpu_naive.bytesize()); - - fwgpu::cutlass_srsgemm_nn(N, N, N, d_A, N, d_B, N, d_C_cutlass, N, true); - fwgpu::memcpy_d2h(c_gpu_cutlass.get_buf(), d_C_cutlass, c_gpu_cutlass.bytesize()); - - 
fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_gpu_naive.size(), c_gpu_cutlass.size()); - EXPECT_EQ(c_gpu_naive.num_rows(), c_gpu_cutlass.num_rows()); - EXPECT_EQ(c_gpu_naive.num_cols(), c_gpu_cutlass.num_cols()); - for (auto i = 0ull; i < c_gpu_naive.size(); ++i) { - EXPECT_FLOAT_EQ(c_gpu_naive[i], c_gpu_cutlass[i]); - } -} - -TEST(regress_cuASR_Srgemm, GpuNaiveEqCutlass_TS_Inner) { - auto m = 125; - auto n = 125; - auto k = 1000; - auto a = fwgpu::Matrix(m, k, 0xCAFED00D, 1.0, 1000.0); - auto b = fwgpu::Matrix(k, n, 0xCAFED00D, 1.0, 1000.0); - auto c_gpu_naive = fwgpu::Matrix(m, n, 0.0f); - auto c_gpu_cutlass = c_gpu_naive; - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats( - a, b, c_gpu_naive, c_gpu_cutlass); - - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C_naive = std::get<2>(dptrs); - float *d_C_cutlass = std::get<3>(dptrs); - - dim3 threads(16, 16); - dim3 blocks((m - 1) / 16 + 1, (n - 1) / 16 + 1); - fwgpu::gpu_srgemm_naive<<>>(m, n, k, d_A, m, d_B, k, d_C_naive, m); - fwgpu::memcpy_d2h(c_gpu_naive.get_buf(), d_C_naive, c_gpu_naive.bytesize()); - - fwgpu::cutlass_srsgemm_nn(m, n, k, d_A, m, d_B, k, d_C_cutlass, m, true); - fwgpu::memcpy_d2h(c_gpu_cutlass.get_buf(), d_C_cutlass, c_gpu_cutlass.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_gpu_naive.size(), c_gpu_cutlass.size()); - EXPECT_EQ(c_gpu_naive.num_rows(), c_gpu_cutlass.num_rows()); - EXPECT_EQ(c_gpu_naive.num_cols(), c_gpu_cutlass.num_cols()); - for (auto i = 0ull; i < c_gpu_naive.size(); ++i) { - EXPECT_FLOAT_EQ(c_gpu_naive[i], c_gpu_cutlass[i]); - } -} - -TEST(regress_cuASR_Srgemm, GpuNaiveEqCutlass_TS_Outer) { - auto m = 1000; - auto n = 1000; - auto k = 125; - auto a = fwgpu::Matrix(m, k, 0xCAFED00D, 1.0, 1000.0); - auto b = fwgpu::Matrix(k, n, 0xCAFED00D, 1.0, 1000.0); - auto c_gpu_naive = fwgpu::Matrix(m, n, 0.0f); - auto c_gpu_cutlass = c_gpu_naive; - - auto dptrs = 
fwgpu::internal::alloc_and_init_device_gemm_mats( - a, b, c_gpu_naive, c_gpu_cutlass); - - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C_naive = std::get<2>(dptrs); - float *d_C_cutlass = std::get<3>(dptrs); - - dim3 threads(16, 16); - dim3 blocks((m - 1) / 16 + 1, (n - 1) / 16 + 1); - fwgpu::gpu_srgemm_naive<<>>(m, n, k, d_A, m, d_B, k, d_C_naive, m); - fwgpu::memcpy_d2h(c_gpu_naive.get_buf(), d_C_naive, c_gpu_naive.bytesize()); - - fwgpu::cutlass_srsgemm_nn(m, n, k, d_A, m, d_B, k, d_C_cutlass, m, true); - fwgpu::memcpy_d2h(c_gpu_cutlass.get_buf(), d_C_cutlass, c_gpu_cutlass.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_gpu_naive.size(), c_gpu_cutlass.size()); - EXPECT_EQ(c_gpu_naive.num_rows(), c_gpu_cutlass.num_rows()); - EXPECT_EQ(c_gpu_naive.num_cols(), c_gpu_cutlass.num_cols()); - for (auto i = 0ull; i < c_gpu_naive.size(); ++i) { - EXPECT_FLOAT_EQ(c_gpu_naive[i], c_gpu_cutlass[i]); - } -} - -TEST(regress_cuASR_Srgemm, CpuNaiveSubEqCutlassSub_TopLeft_2x2x2) { - auto m = 2; - auto n = 2; - auto k = 2; - auto a = fwgpu::Matrix(4, 2, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(2, 4, 1, 1.5f, 100.0f); - auto c_cpu = fwgpu::Matrix(4, 4, 1, 1.5f, 100.0f); - auto c_gpu = c_cpu; - - fwgpu::cpu_srgemm_naive( - m, n, k, // - a.get_buf(), a.num_rows(), // - b.get_buf(), b.num_rows(), // - c_cpu.get_buf(), c_cpu.num_rows()); - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c_gpu); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - fwgpu::cutlass_srsgemm_nn( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C, c_gpu.num_rows(), true); - fwgpu::memcpy_d2h(c_gpu.get_buf(), d_C, c_gpu.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_cpu.size(), c_gpu.size()); - EXPECT_EQ(c_cpu.num_rows(), c_gpu.num_rows()); - EXPECT_EQ(c_cpu.num_cols(), c_gpu.num_cols()); - for (auto i = 0ull; i < 
c_cpu.size(); ++i) { - EXPECT_FLOAT_EQ(c_cpu[i], c_gpu[i]); - } -} - -TEST(regress_cuASR_Srgemm, CpuNaiveSubEqCutlassSub_TopLeft_8x8x8) { - auto m = 8; - auto n = 8; - auto k = 8; - auto a = fwgpu::Matrix(16, 8, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(8, 16, 1, 1.5f, 100.0f); - auto c_cpu = fwgpu::Matrix(16, 16, 1, 1.5f, 100.0f); - auto c_gpu = c_cpu; - - fwgpu::cpu_srgemm_naive( - m, n, k, // - a.get_buf(), a.num_rows(), // - b.get_buf(), b.num_rows(), // - c_cpu.get_buf(), c_cpu.num_rows()); - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c_gpu); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - fwgpu::cutlass_srsgemm_nn( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C, c_gpu.num_rows(), true); - fwgpu::memcpy_d2h(c_gpu.get_buf(), d_C, c_gpu.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_cpu.size(), c_gpu.size()); - EXPECT_EQ(c_cpu.num_rows(), c_gpu.num_rows()); - EXPECT_EQ(c_cpu.num_cols(), c_gpu.num_cols()); - for (auto i = 0ull; i < c_cpu.size(); ++i) { - EXPECT_FLOAT_EQ(c_cpu[i], c_gpu[i]); - } -} - -TEST(regress_cuASR_Srgemm, CpuNaiveSubEqCutlassSub_TopLeft_128x128x8) { - auto m = 128; - auto n = 128; - auto k = 8; - auto a = fwgpu::Matrix(256, 8, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(8, 256, 1, 1.5f, 100.0f); - auto c_cpu = fwgpu::Matrix(256, 256, 1, 1.5f, 100.0f); - auto c_gpu = c_cpu; - - fwgpu::cpu_srgemm_naive( - m, n, k, // - a.get_buf(), a.num_rows(), // - b.get_buf(), b.num_rows(), // - c_cpu.get_buf(), c_cpu.num_rows()); - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c_gpu); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - fwgpu::cutlass_srsgemm_nn( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C, c_gpu.num_rows(), true); - fwgpu::memcpy_d2h(c_gpu.get_buf(), d_C, c_gpu.bytesize()); - - 
fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_cpu.size(), c_gpu.size()); - EXPECT_EQ(c_cpu.num_rows(), c_gpu.num_rows()); - EXPECT_EQ(c_cpu.num_cols(), c_gpu.num_cols()); - for (auto i = 0ull; i < c_cpu.size(); ++i) { - EXPECT_FLOAT_EQ(c_cpu[i], c_gpu[i]); - } -} - -TEST(regress_cuASR_Srgemm, CpuNaiveSubEqGpuNaiveSub_TopLeft_2x2x2) { - auto m = 2; - auto n = 2; - auto k = 2; - auto a = fwgpu::Matrix(4, 2, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(2, 4, 1, 1.5f, 100.0f); - auto c_cpu = fwgpu::Matrix(4, 4, 1, 1.5f, 100.0f); - auto c_gpu = c_cpu; - - fwgpu::cpu_srgemm_naive( - m, n, k, // - a.get_buf(), a.num_rows(), // - b.get_buf(), b.num_rows(), // - c_cpu.get_buf(), c_cpu.num_rows()); - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c_gpu); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - dim3 threads(16, 16); - dim3 blocks((m - 1) / 16 + 1, (n - 1) / 16 + 1); - fwgpu::gpu_srgemm_naive<<>>( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C, a.num_rows()); - fwgpu::memcpy_d2h(c_gpu.get_buf(), d_C, c_gpu.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_cpu.size(), c_gpu.size()); - EXPECT_EQ(c_cpu.num_rows(), c_gpu.num_rows()); - EXPECT_EQ(c_cpu.num_cols(), c_gpu.num_cols()); - for (auto i = 0ull; i < c_cpu.size(); ++i) { - EXPECT_FLOAT_EQ(c_cpu[i], c_gpu[i]); - } -} - -TEST(regress_cuASR_Srgemm, CpuNaiveSubEqGpuNaiveSub_TopLeft_128x128x8) { - auto m = 128; - auto n = 128; - auto k = 8; - auto a = fwgpu::Matrix(256, 8, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(8, 256, 1, 1.5f, 100.0f); - auto c_cpu = fwgpu::Matrix(256, 256, 2, 1.5f, 100.0f); - auto c_gpu = c_cpu; - - fwgpu::cpu_srgemm_naive( - m, n, k, // - a.get_buf(), a.num_rows(), // - b.get_buf(), b.num_rows(), // - c_cpu.get_buf(), c_cpu.num_rows()); - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c_gpu); - float *d_A = std::get<0>(dptrs); - float 
*d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - dim3 threads(16, 16); - dim3 blocks((m - 1) / 16 + 1, (n - 1) / 16 + 1); - fwgpu::gpu_srgemm_naive<<>>( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C, a.num_rows()); - fwgpu::memcpy_d2h(c_gpu.get_buf(), d_C, c_gpu.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_cpu.size(), c_gpu.size()); - EXPECT_EQ(c_cpu.num_rows(), c_gpu.num_rows()); - EXPECT_EQ(c_cpu.num_cols(), c_gpu.num_cols()); - for (auto i = 0ull; i < c_cpu.size(); ++i) { - EXPECT_FLOAT_EQ(c_cpu[i], c_gpu[i]); - } -} - -TEST(regress_cuASR_Srgemm, GpuNaiveSubEqCutlassSub_TopLeft_128x128x8) { - auto m = 128; - auto n = 128; - auto k = 8; - auto a = fwgpu::Matrix(256, 8, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(8, 256, 1, 1.5f, 100.0f); - auto c_gpu_naive = fwgpu::Matrix(256, 256, 2, 1.5f, 100.0f); - auto c_gpu_cutlass = c_gpu_naive; - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats( - a, b, c_gpu_naive, c_gpu_cutlass); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C_naive = std::get<2>(dptrs); - float *d_C_cutlass = std::get<3>(dptrs); - - dim3 threads(16, 16); - dim3 blocks((m - 1) / 16 + 1, (n - 1) / 16 + 1); - fwgpu::gpu_srgemm_naive<<>>( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C_naive, a.num_rows()); - fwgpu::memcpy_d2h(c_gpu_naive.get_buf(), d_C_naive, c_gpu_naive.bytesize()); - - fwgpu::cutlass_srsgemm_nn( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C_cutlass, - c_gpu_cutlass.num_rows(), true); - fwgpu::memcpy_d2h(c_gpu_cutlass.get_buf(), d_C_cutlass, c_gpu_cutlass.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_gpu_naive.size(), c_gpu_cutlass.size()); - EXPECT_EQ(c_gpu_naive.num_rows(), c_gpu_cutlass.num_rows()); - EXPECT_EQ(c_gpu_naive.num_cols(), c_gpu_cutlass.num_cols()); - for (auto i = 0ull; i < c_gpu_naive.size(); ++i) { - EXPECT_FLOAT_EQ(c_gpu_naive[i], c_gpu_cutlass[i]); - } -} - 
-TEST(regress_cuASR_Srgemm, CpuNaiveSubEqGpuNaiveSub_BottomRight_128x128x8) { - auto m = 128; - auto n = 128; - auto k = 8; - auto a = fwgpu::Matrix(256, 8, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(8, 256, 1, 1.5f, 100.0f); - auto c_cpu = fwgpu::Matrix(256, 256, 2, 1.5f, 100.0f); - auto c_gpu = c_cpu; - - fwgpu::cpu_srgemm_naive( - m, n, k, // - a.get_buf(), a.num_rows(), // - b.get_buf(), b.num_rows(), // - c_cpu.get_buf() + (256 * 128) + 128, c_cpu.num_rows()); - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c_gpu); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - dim3 threads(16, 16); - dim3 blocks((m - 1) / 16 + 1, (n - 1) / 16 + 1); - fwgpu::gpu_srgemm_naive<<>>( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C + (256 * 128) + 128, - a.num_rows()); - fwgpu::memcpy_d2h(c_gpu.get_buf(), d_C, c_gpu.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_cpu.size(), c_gpu.size()); - EXPECT_EQ(c_cpu.num_rows(), c_gpu.num_rows()); - EXPECT_EQ(c_cpu.num_cols(), c_gpu.num_cols()); - for (auto i = 0ull; i < c_cpu.size(); ++i) { - EXPECT_FLOAT_EQ(c_cpu[i], c_gpu[i]); - } -} - -TEST(regress_cuASR_Srgemm, GpuNaiveSubEqCutlassSub_BottomRight_128x128x8) { - auto m = 128; - auto n = 128; - auto k = 8; - auto a = fwgpu::Matrix(256, 8, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(8, 256, 1, 1.5f, 100.0f); - auto c_gpu_naive = fwgpu::Matrix(256, 256, 2, 1.5f, 100.0f); - auto c_gpu_cutlass = c_gpu_naive; - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats( - a, b, c_gpu_naive, c_gpu_cutlass); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C_naive = std::get<2>(dptrs); - float *d_C_cutlass = std::get<3>(dptrs); - - dim3 threads(16, 16); - dim3 blocks((m - 1) / 16 + 1, (n - 1) / 16 + 1); - fwgpu::gpu_srgemm_naive<<>>( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C_naive + (256 * 128) + 128, - a.num_rows()); - 
fwgpu::memcpy_d2h(c_gpu_naive.get_buf(), d_C_naive, c_gpu_naive.bytesize()); - - fwgpu::cutlass_srsgemm_nn( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C_cutlass + (256 * 128) + 128, - c_gpu_cutlass.num_rows(), true); - fwgpu::memcpy_d2h(c_gpu_cutlass.get_buf(), d_C_cutlass, c_gpu_cutlass.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_gpu_naive.size(), c_gpu_cutlass.size()); - EXPECT_EQ(c_gpu_naive.num_rows(), c_gpu_cutlass.num_rows()); - EXPECT_EQ(c_gpu_naive.num_cols(), c_gpu_cutlass.num_cols()); - for (auto i = 0ull; i < c_gpu_naive.size(); ++i) { - EXPECT_FLOAT_EQ(c_gpu_naive[i], c_gpu_cutlass[i]); - } -} - -TEST(regress_cuASR_Srgemm, CpuNaiveEqCutlass_Small_17x27x17) { - auto m = 17; - auto n = 27; - auto k = 17; - auto a = fwgpu::Matrix(17, 17, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(17, 27, 0, 1.5f, 100.0f); - auto c_cpu = fwgpu::Matrix(17, 27, 1, 1.5f, 100.0f); - auto c_gpu = c_cpu; - - fwgpu::cpu_srgemm_naive( - m, n, k, // - a.get_buf(), a.num_rows(), // - b.get_buf(), b.num_rows(), // - c_cpu.get_buf(), c_cpu.num_rows()); - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c_gpu); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - fwgpu::cutlass_srsgemm_nn( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C, c_gpu.num_rows(), true); - fwgpu::memcpy_d2h(c_gpu.get_buf(), d_C, c_gpu.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_cpu.size(), c_gpu.size()); - EXPECT_EQ(c_cpu.num_rows(), c_gpu.num_rows()); - EXPECT_EQ(c_cpu.num_cols(), c_gpu.num_cols()); - for (auto i = 0ull; i < c_cpu.size(); ++i) { - EXPECT_FLOAT_EQ(c_cpu[i], c_gpu[i]); - } -} - -TEST(regress_cuASR_Srgemm, CpuNaiveSubEqCutlassSub_Small_7x5x6) { - auto m = 7; - auto n = 6; - auto k = 5; - auto a = fwgpu::Matrix(17, 17, 0, 1.5f, 100.0f); - auto b = fwgpu::Matrix(17, 27, 0, 1.5f, 100.0f); - auto c_cpu = fwgpu::Matrix(17, 27, 1, 1.5f, 
100.0f); - auto c_gpu = c_cpu; - - fwgpu::cpu_srgemm_naive( - m, n, k, // - a.get_buf(), a.num_rows(), // - b.get_buf(), b.num_rows(), // - c_cpu.get_buf(), c_cpu.num_rows()); - - auto dptrs = fwgpu::internal::alloc_and_init_device_gemm_mats(a, b, c_gpu); - float *d_A = std::get<0>(dptrs); - float *d_B = std::get<1>(dptrs); - float *d_C = std::get<2>(dptrs); - - fwgpu::cutlass_srsgemm_nn( - m, n, k, d_A, a.num_rows(), d_B, b.num_rows(), d_C, c_gpu.num_rows(), true); - fwgpu::memcpy_d2h(c_gpu.get_buf(), d_C, c_gpu.bytesize()); - - fwgpu::internal::dealloc_device_gemm_mats(dptrs); - - EXPECT_EQ(c_cpu.size(), c_gpu.size()); - EXPECT_EQ(c_cpu.num_rows(), c_gpu.num_rows()); - EXPECT_EQ(c_cpu.num_cols(), c_gpu.num_cols()); - for (auto i = 0ull; i < c_cpu.size(); ++i) { - EXPECT_FLOAT_EQ(c_cpu[i], c_gpu[i]); - } -} diff --git a/test/regress/include/fwgpu/Matrix.hpp b/test/regress/include/fwgpu/Matrix.hpp deleted file mode 100644 index 224c7a3..0000000 --- a/test/regress/include/fwgpu/Matrix.hpp +++ /dev/null @@ -1,248 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace fwgpu { - -struct ColumnMajor { - size_t m_rows; - size_t m_cols; - - ColumnMajor() = delete; - ColumnMajor(size_t rows, size_t cols) - : m_rows(rows) - , m_cols(cols) {}; - - auto linearize(size_t row_idx, size_t col_idx) const noexcept -> size_t { - return row_idx + (m_rows * col_idx); - } -}; - -struct RowMajor { - size_t m_rows; - size_t m_cols; - - RowMajor() = delete; - RowMajor(size_t rows, size_t cols) - : m_rows(rows) - , m_cols(cols) {}; - - auto linearize(size_t row_idx, size_t col_idx) const noexcept -> size_t { - return (row_idx * m_cols) + col_idx; - } -}; - -/* - * Matrix datastructure for a tightly packed 2D array. - * ElementT = float and column major by default. - **/ -template -class Matrix { -private: - Layout m_layout; - std::vector m_host_buf; - -public: - /* - * No default contructor. 
- **/ - Matrix() = delete; - - /* - * Default distructor. - **/ - ~Matrix() = default; - - /* - * De-facto default constructor: allocate ElementT buffer of size rows*cols - **/ - Matrix(size_t rows, size_t cols) - : m_layout(rows, cols) - , m_host_buf(rows * cols) { } - - /* - * Assign buf from external source. - * TODO: not sure we should allow this? - **/ - Matrix(size_t rows, size_t cols, ElementT *buf) - : m_layout(rows, cols) - , m_host_buf(buf, buf + (rows * cols)) { } - - /* - * Allocates and initializes the matrix with input value. - **/ - Matrix(size_t rows, size_t cols, ElementT val) - : m_layout(rows, cols) - , m_host_buf(rows * cols, val) { } - - /* - * Random Fill Constructor: allocates and initializes the matrix - * with random numbers in the input range. - **/ - Matrix( - size_t rows, - size_t cols, - size_t seed, - ElementT min = ElementT(0.0), - ElementT max = ElementT(1.0)) - : m_layout(rows, cols) - , m_host_buf(rows * cols) { - using Distribution = std::conditional_t< - std::is_integral::value, // if ElementT is integral - std::uniform_int_distribution, // use int dist - std::uniform_real_distribution // otherwise floating point dist - >; - auto rng = std::mt19937_64(seed); - auto dist = Distribution(min, max); - for (auto i = 0ull; i < (rows * cols); ++i) { - m_host_buf[i] = dist(rng); - } - } - - /* - * Allocates and initializes the matrix from an initializer list. - * This mainly makes testing easier. 
- **/ - Matrix(size_t rows, size_t cols, const std::initializer_list &elements) - : m_layout(rows, cols) - , m_host_buf(rows * cols) { - auto i = 0ull; - for (auto val : elements) { - m_host_buf[i++] = val; - } - } - - /* - * Copy constructor: deep copy other - **/ - Matrix(const Matrix &other) - : m_layout(other.m_layout) - , m_host_buf(other.m_host_buf) { } - - /* - * Move constructor: sink other into this - */ - Matrix(Matrix &&other) - : m_layout(other.m_layout) - , m_host_buf(std::move(other.m_host_buf)) { - other.m_layout.m_rows = 0; - other.m_layout.m_cols = 0; - } - - /* - * Copy assignment operator. - **/ - auto operator=(const Matrix &other) -> Matrix & { - m_layout = other.m_layout; - m_host_buf = other.m_host_buf; - return *this; - } - - /* - * Returns a non-owning, const pointer to the backing buffer of type ElementT[]. - **/ - auto get_buf() const noexcept -> const ElementT * { return m_host_buf.data(); } - - /* - * Returns a non-owning pointer to the backing buffer of type ElementT[]. - **/ - auto get_buf() noexcept -> ElementT * { return m_host_buf.data(); } - - /* - * Returns total number of elements stored in the matrix. - **/ - auto size() const noexcept -> size_t { return m_layout.m_rows * m_layout.m_cols; } - - /* - * Returns total number of bytes occupied by the backing store ElementT[]. - **/ - auto bytesize() const noexcept -> size_t { return size() * sizeof(ElementT); } - - /* - * Returns true if matrix has (0, 0) dimentions. False otherwise. - **/ - auto is_empty() const noexcept -> size_t { - return (m_layout.m_rows == 0) || (m_layout.m_cols == 0); - } - - /* - * Returns numbers of rows in the matrix. - **/ - auto num_rows() const noexcept -> size_t { return m_layout.m_rows; } - - /* - * Returns numbers of columns in the matrix. - **/ - auto num_cols() const noexcept -> size_t { return m_layout.m_cols; } - - /* - * Linear index into the flat buffer. 
- **/ - auto operator[](size_t idx) -> ElementT & { return m_host_buf[idx]; } - auto operator[](size_t idx) const -> ElementT const & { return m_host_buf[idx]; } - - /* - * Linear index into the flat buffer. - **/ - auto operator()(size_t idx) -> ElementT & { return m_host_buf[idx]; } - auto operator()(size_t idx) const -> ElementT const & { return m_host_buf[idx]; } - - /* - * Matrix index with major dimention offset. - * Column major for now, but we can add support for changing to row major later - * with some template magic. - */ - auto operator()(size_t row_idx, size_t col_idx) -> ElementT & { - return m_host_buf[m_layout.linearize(row_idx, col_idx)]; - } - - auto operator()(size_t row_idx, size_t col_idx) const -> ElementT const & { - return m_host_buf[m_layout.linearize(row_idx, col_idx)]; - } -}; - -// Element-wise equality test for two matrices of the same template type. -template -inline auto operator==(const Matrix &lhs, const Matrix &rhs) -> bool { - // both dims much match first - if ((lhs.num_rows() != rhs.num_rows()) || (lhs.num_cols() != rhs.num_cols())) { - return false; - } - - for (auto i = 0ull; i < lhs.size(); ++i) { - if (lhs[i] < rhs[i]) { - return false; - } - } - - return true; -} - -// Element-wise inequality test for two matrices of the same template type. -template -inline auto operator!=(const Matrix &lhs, const Matrix &rhs) -> bool { - return !(lhs == rhs); -} - -// Prints matrix to stdout; prefer using this only for small matrices. 
-template -inline auto operator<<(std::ostream &os, const Matrix &mat) -> std::ostream & { - for (auto row_idx = 0ull; row_idx < mat.num_rows(); ++row_idx) { - os << '[' << mat(row_idx, 0); - - for (auto col_idx = 1ull; col_idx < mat.num_cols() - 1; ++col_idx) { - os << ", " << mat(row_idx, col_idx); - } - - os << "]\n"; - } - - return os; -} - -} // namespace fwgpu diff --git a/test/regress/include/fwgpu/cpu_srgemm.hpp b/test/regress/include/fwgpu/cpu_srgemm.hpp deleted file mode 100644 index 85b7d98..0000000 --- a/test/regress/include/fwgpu/cpu_srgemm.hpp +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -namespace fwgpu { - -template -inline auto cpu_srgemm_naive( - int m, int n, int k, const T *A, int lda, const T *B, int ldb, T *C, int ldc) - -> void { - for (int row = 0; row < m; ++row) { - for (int col = 0; col < n; ++col) { - T mindist = C[row + (col * ldc)]; - for (int i = 0; i < k; ++i) { - mindist = std::min(mindist, A[row + (i * lda)] + B[i + (col * ldb)]); - } - C[row + (col * ldc)] = mindist; - } - } -} - -template -inline auto cpu_fwgemm_naive( - int m, - int n, - int k, - const TData *A, - int lda, - const TData *B, - int ldb, - TData *dist, - int ldc, - TIdx *parent) -> void { - for (int row = 0; row < n; ++row) { - for (int col = 0; col < n; ++col) { - // dist and parent for this vertex pair (i, j) - TData curr_dist = dist[row + (col * ldc)]; - TIdx curr_parent = parent[row + (col * ldc)]; - for (int k = 0; k < n; ++k) { - TData prod = A[row + (k * lda)] + B[k + (col * ldb)]; - if (prod < curr_dist) { - curr_dist = prod; - curr_parent = k; - } - } - dist[row + (col * ldc)] = curr_dist; - parent[row + (col * ldc)] = curr_parent; - } - } -} - -} // namespace fwgpu diff --git a/test/regress/include/fwgpu/gpu_srgemm.cuh b/test/regress/include/fwgpu/gpu_srgemm.cuh deleted file mode 100644 index 0ba19b3..0000000 --- a/test/regress/include/fwgpu/gpu_srgemm.cuh +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include - -namespace fwgpu { - -template 
-__global__ auto -gpu_srgemm_naive(int m, int n, int k, T *A, int lda, T *B, int ldb, T *dist, int ldc) - -> void { - size_t ty = blockIdx.y * blockDim.y + threadIdx.y; - size_t tx = blockIdx.x * blockDim.x + threadIdx.x; - - size_t n_idx = ty; - while (n_idx < n) { - size_t m_idx = tx; - while (m_idx < m) { - // initialize current minimum distance - T mindist = dist[(n_idx * ldc) + m_idx]; - for (size_t k_idx = 0; k_idx < k; ++k_idx) { - // calculate the distance between n_idx->m_idx by going through k_idx - T thisone = A[(k_idx * lda) + m_idx] + B[(n_idx * ldb) + k_idx]; - if (thisone < mindist) { - mindist = thisone; - } - } - // finally, store new min distance to dist matrix - dist[(n_idx * ldc) + m_idx] = mindist; - m_idx += gridDim.x * blockDim.x; - } - n_idx += gridDim.y * blockDim.y; - } -} - -} // namespace fwgpu diff --git a/test/regress/include/fwgpu/gpu_srgemm.hpp b/test/regress/include/fwgpu/gpu_srgemm.hpp deleted file mode 100644 index 366e812..0000000 --- a/test/regress/include/fwgpu/gpu_srgemm.hpp +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -namespace fwgpu { - -// Cutlass semiring gemm based on {sum, min} as ring operators -auto cutlass_srsgemm_nn( - int M, - int N, - int K, - float const *A, - int lda, - float const *B, - int ldb, - float *C, - int ldc, - float *D, - bool do_epilogue_min = true, - void *stream = nullptr) -> int; - -// Cutlass semiring sgemm based on {sum, min} as ring operators -auto cutlass_srsgemm_nn( - int M, - int N, - int K, - float const *A, - int lda, - float const *B, - int ldb, - float *C, - int ldc, - bool do_epilogue_min = true, - void *stream = nullptr) -> int; - -} // namespace fwgpu diff --git a/test/regress/include/fwgpu/utils.hpp b/test/regress/include/fwgpu/utils.hpp deleted file mode 100644 index 372f2c9..0000000 --- a/test/regress/include/fwgpu/utils.hpp +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -namespace fwgpu { - -// device memory allocation of size bytes -auto malloc_device(void **dptr, size_t size) 
-> int; - -// unified managed memory allocation of size bytes -auto malloc_unified(void **dptr, size_t size) -> int; - -// free cuda allocated memory, managed or unmanaged -auto free_device(void *dptr) -> int; - -// MEMCPY API -// memory copy: device -> host -auto memcpy_d2h(void *dest, const void *src, size_t size) -> int; - -// memory copy: host -> device -auto memcpy_h2d(void *dest, const void *src, size_t size) -> int; - -// memory copy: host -> host -auto memcpy_h2h(void *dest, const void *src, size_t size) -> int; - -// memory copy: device -> device -auto memcpy_d2d(void *dest, const void *src, size_t size) -> int; - -// memory copy: direction inferred based on src and dest. Requires unified memory. -auto memcpy_inferred(void *dest, const void *src, size_t size) -> int; - -auto memcpy_2d_h2d( - void *deset, - size_t dpitch, - const void *src, - size_t spitch, - size_t width, - size_t height) -> int; -auto memcpy_2d_d2h( - void *deset, - size_t dpitch, - const void *src, - size_t spitch, - size_t width, - size_t height) -> int; -auto memcpy_2d_d2d( - void *dest, - size_t dpitch, - const void *src, - size_t spitch, - size_t width, - size_t height) -> int; -auto memcpy_2d_inferred( - void *dest, - size_t dpitch, - const void *src, - size_t spitch, - size_t width, - size_t height) -> int; -} // namespace fwgpu diff --git a/test/regress/src/cutlass_srgemm.cu b/test/regress/src/cutlass_srgemm.cu deleted file mode 100644 index 9a775c7..0000000 --- a/test/regress/src/cutlass_srgemm.cu +++ /dev/null @@ -1,105 +0,0 @@ -#include "fwgpu/gpu_srgemm.hpp" - -#include "cuasr/arch/srmma.h" -#include "cuasr/gemm/device/default_srgemm_configuration.h" -#include "cuasr/gemm/device/srgemm.h" - -#include "cutlass/functional.h" - -namespace fwgpu { - -auto cutlass_srsgemm_nn( - int M, - int N, - int K, - float const *A, - int lda, - float const *B, - int ldb, - float *C, - int ldc, - float *D, - bool do_epilogue_min, - void *stream) -> int { - cudaStream_t stream_ = nullptr; - if 
(stream) { - stream_ = *(static_cast(stream)); - } - // compile time configuration of this srgemm kernel - using OperatorClass = cutlass::arch::OpClassSimt; - using SmArch = cutlass::arch::Sm50; - using TropicalConfig = typename cuasr::gemm::device::DefaultSemiRingConfiguration< - float, float, float, float, OperatorClass, cuasr::minimum, - cuasr::plus, SmArch>; - - using AdditionOp = TropicalConfig::AdditionOp; - using MultiplicationOp = TropicalConfig::MultiplicationOp; - using ColumnMajor = cutlass::layout::ColumnMajor; - using ThreadblockShape = typename TropicalConfig::ThreadblockShape; - using WarpShape = typename TropicalConfig::WarpShape; - using InstructionShape = typename TropicalConfig::InstructionShape; - using EpilogueOutputOp = typename TropicalConfig::EpilogueOutputOp; - using ThreadblockSwizzle = - typename cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; - constexpr int Stages = TropicalConfig::kStages; - constexpr int AlignmentA = TropicalConfig::kAlignmentA; - constexpr int AlignmentB = TropicalConfig::kAlignmentB; - - using cuASR_MinPlus_SGEMM = cuasr::gemm::device::Srgemm< - AdditionOp, // Thread level SemiRing operator - MultiplicationOp, // Thread level SemiRing operator - float, // element type of A - ColumnMajor, // layout of A - float, // element type of B - ColumnMajor, // layout of B - float, // element type of C - ColumnMajor, // layout of C - float, // element type of D - OperatorClass, // Logical operator class (SIMT/Tensor) - SmArch, // cuda architecture - ThreadblockShape, // GEMM shape at CTA level - WarpShape, // GEMM shape at Warp level - InstructionShape, // GEMM shape at thread level - EpilogueOutputOp, // Epilogue operator at thread level - ThreadblockSwizzle, // GEMM threadblock swizzler - Stages, // Pipeline stages for shmem - AlignmentA, // Alignment of A elements - AlignmentB, // Alignment of B elements - false // SplitKSerial - >; - - float alpha = MultiplicationOp::Identity; - float beta - = do_epilogue_min 
? MultiplicationOp::Identity : MultiplicationOp::Annihilator; - // construct kernel arguments struct - cuASR_MinPlus_SGEMM::Arguments args( - { M, N, K }, // Problem dimensions - { A, lda }, // Tensor-ref for source matrix A - { B, ldb }, // Tensor-ref for source matrix B - { C, ldc }, // Tensor-ref for source matrix C - { D, ldc }, // Tensor-ref for destination matrix D - { alpha, beta } // True if we perform a final min with source matrix C - ); - - // launch SRGEMM kernel - cuASR_MinPlus_SGEMM minplus_gemm; - cutlass::Status status = minplus_gemm(args, nullptr, stream_); - return static_cast(status); -} - -auto cutlass_srsgemm_nn( - int M, - int N, - int K, - float const *A, - int lda, - float const *B, - int ldb, - float *C, - int ldc, - bool do_epilogue_min, - void *stream) -> int { - return cutlass_srsgemm_nn(M, N, K, A, lda, B, ldb, C, ldc, C, do_epilogue_min, stream); -} - -} // namespace fwgpu diff --git a/test/regress/src/utils.cu b/test/regress/src/utils.cu deleted file mode 100644 index f8ce472..0000000 --- a/test/regress/src/utils.cu +++ /dev/null @@ -1,95 +0,0 @@ -#include "fwgpu/utils.hpp" - -#include - -namespace fwgpu { - -auto malloc_device(void **dptr, size_t size) -> int { - auto retval = static_cast(cudaMalloc(dptr, size)); - return retval; -} - -auto malloc_unified(void **dptr, size_t size) -> int { - auto retval = static_cast(cudaMallocManaged(dptr, size)); - return retval; -} - -auto memcpy_inferred(void *dest, const void *src, size_t size) -> int { - auto retval = static_cast(cudaMemcpy(dest, src, size, cudaMemcpyDefault)); - return retval; -} - -auto free_device(void *dbuf) -> int { - auto retval = static_cast(cudaFree(dbuf)); - return retval; -} - -auto memcpy_d2h(void *dest, const void *src, size_t size) -> int { - auto retval = static_cast(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost)); - return retval; -} - -auto memcpy_h2d(void *dest, const void *src, size_t size) -> int { - auto retval = static_cast(cudaMemcpy(dest, src, size, 
cudaMemcpyHostToDevice)); - return retval; -} - -auto memcpy_h2h(void *dest, const void *src, size_t size) -> int { - auto retval = static_cast(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice)); - return retval; -} - -auto memcpy_d2d(void *dest, const void *src, size_t size) -> int { - auto retval = static_cast(cudaMemcpy(dest, src, size, cudaMemcpyHostToHost)); - return retval; -} - -auto memcpy_2d_h2d( - void *dest, - size_t dpitch, - const void *src, - size_t spitch, - size_t width, - size_t height) -> int { - auto retval = static_cast( - cudaMemcpy2D(dest, dpitch, src, spitch, width, height, cudaMemcpyHostToDevice)); - return retval; -} - -auto memcpy_2d_d2h( - void *dest, - size_t dpitch, - const void *src, - size_t spitch, - size_t width, - size_t height) -> int { - auto retval = static_cast( - cudaMemcpy2D(dest, dpitch, src, spitch, width, height, cudaMemcpyDeviceToHost)); - return retval; -} - -auto memcpy_2d_d2d( - void *dest, - size_t dpitch, - const void *src, - size_t spitch, - size_t width, - size_t height) -> int { - auto retval = static_cast( - cudaMemcpy2D(dest, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice)); - return retval; -} - -auto memcpy_2d_inferred( - void *dest, - size_t dpitch, - const void *src, - size_t spitch, - size_t width, - size_t height) -> int { - auto retval = static_cast( - cudaMemcpy2D(dest, dpitch, src, spitch, width, height, cudaMemcpyDefault)); - return retval; -} - -} // namespace fwgpu diff --git a/test/regress/utils.cuh b/test/regress/utils.cuh deleted file mode 100644 index 765ae65..0000000 --- a/test/regress/utils.cuh +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef cuASR_INTERNAL_UTILS -#define cuASR_INTERNAL_UTILS - -#include - -#include "fwgpu/Matrix.hpp" - -namespace fwgpu { -namespace internal { - -template -inline auto alloc_and_init_device_gemm_mats( - const Matrix &A, const Matrix &B, const Matrix &C) - -> std::tuple { - // allocate for inputs and outputs on device - void *d_A, *d_B, *d_C; - 
cudaMalloc(&d_A, A.bytesize()); - cudaMalloc(&d_B, B.bytesize()); - cudaMalloc(&d_C, C.bytesize()); - - // copy inputs to device - cudaMemcpy(d_A, A.get_buf(), A.bytesize(), cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B.get_buf(), B.bytesize(), cudaMemcpyHostToDevice); - cudaMemcpy(d_C, C.get_buf(), C.bytesize(), cudaMemcpyHostToDevice); - - return std::make_tuple( - reinterpret_cast(d_A), reinterpret_cast(d_B), reinterpret_cast(d_C)); -} - -template -inline auto dealloc_device_gemm_mats(std::tuple device_ptrs) -> void { - cudaFree(std::get<0>(device_ptrs)); - cudaFree(std::get<1>(device_ptrs)); - cudaFree(std::get<2>(device_ptrs)); -} - -template -inline auto alloc_and_init_device_gemm_mats( - const Matrix &A, const Matrix &B, const Matrix &C, const Matrix &D) - -> std::tuple { - // allocate for inputs and outputs on device - void *d_A, *d_B, *d_C, *d_D; - cudaMalloc(&d_A, A.bytesize()); - cudaMalloc(&d_B, B.bytesize()); - cudaMalloc(&d_C, C.bytesize()); - cudaMalloc(&d_D, D.bytesize()); - - // copy inputs to device - cudaMemcpy(d_A, A.get_buf(), A.bytesize(), cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B.get_buf(), B.bytesize(), cudaMemcpyHostToDevice); - cudaMemcpy(d_C, C.get_buf(), C.bytesize(), cudaMemcpyHostToDevice); - cudaMemcpy(d_D, D.get_buf(), D.bytesize(), cudaMemcpyHostToDevice); - - return std::make_tuple( - reinterpret_cast(d_A), reinterpret_cast(d_B), reinterpret_cast(d_C), - reinterpret_cast(d_D)); -} - -template -inline auto dealloc_device_gemm_mats(std::tuple device_ptrs) -> void { - cudaFree(std::get<0>(device_ptrs)); - cudaFree(std::get<1>(device_ptrs)); - cudaFree(std::get<2>(device_ptrs)); - cudaFree(std::get<3>(device_ptrs)); -} - -} // namespace internal -} // namespace fwgpu - -#endif // cuASR_INTERNAL_UTILS diff --git a/tools/include/cuasr/reference/srgemm/host_srgemm.h b/tools/include/cuasr/reference/srgemm/host_srgemm.h index f4363c6..f45395b 100644 --- a/tools/include/cuasr/reference/srgemm/host_srgemm.h +++ 
b/tools/include/cuasr/reference/srgemm/host_srgemm.h @@ -16,8 +16,7 @@ namespace host { /// Host side SemiRing GEMM for rank-2 tensors for testing. template < - typename AdditionOp, - typename MultiplicationOp, + typename RingOp, typename ElementA, typename LayoutA, typename ElementB, @@ -57,8 +56,7 @@ struct Srgemm { constexpr int Nblock = 32; ConvertOp convert_op; - AdditionOp add_op; - MultiplicationOp mult_op; + RingOp ring_op; #pragma omp for schedule(static) collapse(2) for (int row_block = 0; row_block < M; row_block += Mblock) { @@ -84,7 +82,7 @@ struct Srgemm { ComputeType compute_a(static_cast(a)); ComputeType compute_b(static_cast(b)); - accum[i][j] = add_op(mult_op(compute_a, compute_b), accum[i][j]); + ring_op.fma(accum[i][j], compute_a, compute_b, accum[i][j]); } } } @@ -97,13 +95,15 @@ struct Srgemm { int col = col_block + j; cutlass::MatrixCoord coord(row, col); if (row < M && col < N) { - auto c = tensor_c.at(coord); - tensor_d.at(coord) = convert_op( // - add_op( // - mult_op(alpha, accum[i][j]), // - mult_op(beta, c) // - ) // + auto c = tensor_c.at(coord); + // clang-format off + tensor_d.at(coord) = convert_op( + ring_op.add( + ring_op.mult(alpha, accum[i][j]), + ring_op.mult(beta, c) + ) ); + // clang-format on } } }